[openib-general] [PATCH/RFC] libibverbs and libmthca fork support

Roland Dreier rdreier at cisco.com
Tue Aug 1 07:21:15 PDT 2006


 > You forgot to include buf.c in the patch.

Oops, forgot to do svn add before generating the diff.  Updated diff below:

Index: libibverbs/include/infiniband/driver.h
===================================================================
--- libibverbs/include/infiniband/driver.h	(revision 8793)
+++ libibverbs/include/infiniband/driver.h	(working copy)
@@ -135,6 +135,9 @@ int ibv_cmd_destroy_ah(struct ibv_ah *ah
 int ibv_cmd_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 int ibv_cmd_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+int ibv_dontfork_range(void *base, size_t size);
+int ibv_dofork_range(void *base, size_t size);
+
 /*
  * sysfs helper functions
  */
Index: libibverbs/include/infiniband/verbs.h
===================================================================
--- libibverbs/include/infiniband/verbs.h	(revision 8793)
+++ libibverbs/include/infiniband/verbs.h	(working copy)
@@ -285,6 +285,8 @@ struct ibv_pd {
 struct ibv_mr {
 	struct ibv_context     *context;
 	struct ibv_pd	       *pd;
+	void		       *addr;
+	size_t			length;
 	uint32_t		handle;
 	uint32_t		lkey;
 	uint32_t		rkey;
@@ -1016,6 +1018,14 @@ int ibv_attach_mcast(struct ibv_qp *qp, 
  */
 int ibv_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+/**
+ * ibv_fork_init - Prepare data structures so that fork() may be used
+ * safely.  If this function is not called or returns a non-zero
+ * status, then libibverbs data structures are not fork()-safe and the
+ * effect of an application calling fork() is undefined.
+ */
+int ibv_fork_init(void);
+
 END_C_DECLS
 
 #  undef __attribute_const
Index: libibverbs/ChangeLog
===================================================================
--- libibverbs/ChangeLog	(revision 8793)
+++ libibverbs/ChangeLog	(working copy)
@@ -1,3 +1,29 @@
+2006-07-26  Roland Dreier  <rdreier at cisco.com>
+
+	* src/verbs.c (ibv_reg_mr, ibv_dereg_mr): Add calls to
+	ibv_dontfork_range() and ibv_dofork_range() for memory regions
+	registered by library consumers.
+
+	* include/infiniband/verbs.h: Add declaration of ibv_fork_init().
+
+	* include/infiniband/driver.h: Add declarations of
+	ibv_dontfork_range() and ibv_dofork_range().
+
+	* src/memory.c: Rewrite to use a red-black tree instead of a
+	linked list.  Change from doing mlock()/munlock() to
+	madvise(..., MADV_DONTFORK) and madvise(..., MADV_DOFORK), and
+	change the name of the entry points to ibv_dontfork_range() and
+	ibv_dofork_range().  Add ibv_fork_init() for applications to
+	request fork-safe behavior.
+
+	* src/ibverbs.h: Kill off unused declarations.
+
+	* src/init.c (ibverbs_init): Get rid of call to ibv_init_mem_map().
+
+	* include/infiniband/verbs.h: Add addr and length field to struct
+	ibv_mr so that memory regions can be madvised().  This changes the
+	ABI, since the layout of struct ibv_mr is changed.
+
 2006-07-04  Roland Dreier  <rdreier at cisco.com>
 
 	* include/infiniband/arch.h: Fix typo in sparc mb()
Index: libibverbs/src/libibverbs.map
===================================================================
--- libibverbs/src/libibverbs.map	(revision 8793)
+++ libibverbs/src/libibverbs.map	(working copy)
@@ -74,6 +74,9 @@ IBVERBS_1.0 {
 		mult_to_ibv_rate;
 		ibv_get_sysfs_path;
 		ibv_read_sysfs_file;
+		ibv_fork_init;
+		ibv_dontfork_range;
+		ibv_dofork_range;
 
 	local: *;
 };
Index: libibverbs/src/ibverbs.h
===================================================================
--- libibverbs/src/ibverbs.h	(revision 8793)
+++ libibverbs/src/ibverbs.h	(working copy)
@@ -58,11 +58,7 @@ struct ibv_abi_compat_v2 {
 
 extern HIDDEN int abi_ver;
 
-extern HIDDEN int ibverbs_init(struct ibv_device ***list);
-
-extern HIDDEN int ibv_init_mem_map(void);
-extern HIDDEN int ibv_lock_range(void *base, size_t size);
-extern HIDDEN int ibv_unlock_range(void *base, size_t size);
+HIDDEN int ibverbs_init(struct ibv_device ***list);
 
 #define IBV_INIT_CMD(cmd, size, opcode)					\
 	do {								\
Index: libibverbs/src/verbs.c
===================================================================
--- libibverbs/src/verbs.c	(revision 8793)
+++ libibverbs/src/verbs.c	(working copy)
@@ -155,18 +155,32 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd 
 {
 	struct ibv_mr *mr;
 
+	if (ibv_dontfork_range(addr, length))
+		return NULL;
+
 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
 	if (mr) {
 		mr->context = pd->context;
 		mr->pd      = pd;
-	}
+		mr->addr    = addr;
+		mr->length  = length;
+	} else
+		ibv_dofork_range(addr, length);
 
 	return mr;
 }
 
 int ibv_dereg_mr(struct ibv_mr *mr)
 {
-	return mr->context->ops.dereg_mr(mr);
+	int ret;
+	void *addr	= mr->addr;	/* saved first: mr is not read again after dereg_mr() */
+	size_t length	= mr->length;	/* NOTE(review): presumably the driver's dereg_mr() frees mr -- confirm */
+
+	ret = mr->context->ops.dereg_mr(mr);	/* driver-specific deregistration */
+	if (!ret)
+		ibv_dofork_range(addr, length);	/* undo the ibv_dontfork_range() done in ibv_reg_mr() */
+
+	return ret;	/* status of the driver's dereg_mr(); the ibv_dofork_range() result is ignored */
 }
 
 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
Index: libibverbs/src/init.c
===================================================================
--- libibverbs/src/init.c	(revision 8793)
+++ libibverbs/src/init.c	(working copy)
@@ -205,9 +205,6 @@ HIDDEN int ibverbs_init(struct ibv_devic
 
 	*list = NULL;
 
-	if (ibv_init_mem_map())
-		return 0;
-
 	find_drivers(default_path);
 
 	/*
Index: libibverbs/src/memory.c
===================================================================
--- libibverbs/src/memory.c	(revision 8793)
+++ libibverbs/src/memory.c	(working copy)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,6 +37,7 @@
 #  include <config.h>
 #endif /* HAVE_CONFIG_H */
 
+#include <errno.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -44,114 +46,424 @@
 #include "ibverbs.h"
 
 /*
- * We keep a linked list of page ranges that have been locked along with a
- * reference count to manage overlapping registrations, etc.
- *
- * Eventually we should turn this into an RB-tree or something similar
- * to avoid the O(n) cost of registering/unregistering memory.
+ * Most distributions' headers don't define these yet.
  */
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK	10
+#endif
+
+#ifndef MADV_DOFORK
+#define MADV_DOFORK	11
+#endif
 
 struct ibv_mem_node {
-	struct ibv_mem_node *prev, *next;
-	uintptr_t            start, end;
-	int                  refcnt;
+	enum {
+		IBV_RED,
+		IBV_BLACK
+	}			color;	/* red-black tree color of this node */
+	struct ibv_mem_node    *parent;	/* NULL for the tree root (mm_root) */
+	struct ibv_mem_node    *left, *right;	/* children, NULL when absent */
+	uintptr_t		start, end;	/* address range covered; both ends inclusive */
+	int			refcnt;	/* +1 per MADV_DONTFORK request, -1 per MADV_DOFORK request */
 };
 
-static struct {
-	struct ibv_mem_node *first;
-	pthread_mutex_t      mutex;
-	uintptr_t            page_size;
-} mem_map;
+static struct ibv_mem_node *mm_root;
+static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
+static int page_size;
+static int too_late;
 
-int ibv_init_mem_map(void)
+int ibv_fork_init(void)
 {
-	struct ibv_mem_node *node = NULL;
-
-	node = malloc(sizeof *node);
-	if (!node)
-		goto fail;
-
-	node->prev   = node->next = NULL;
-	node->start  = 0;
-	node->end    = UINTPTR_MAX;
-	node->refcnt = 0;
+	void *tmp;	/* scratch page, used only to probe madvise() support */
 
-	mem_map.first = node;
+	if (mm_root)
+		return 0;	/* already initialized */
 
-	mem_map.page_size = sysconf(_SC_PAGESIZE);
-	if (mem_map.page_size < 0)
-		goto fail;
+	if (too_late)
+		return EINVAL;	/* a dontfork/dofork call already ran untracked */
 
-	if (pthread_mutex_init(&mem_map.mutex, NULL))
-		goto fail;
+	page_size = sysconf(_SC_PAGESIZE);
+	if (page_size < 0)
+		return errno;
+
+	if (posix_memalign(&tmp, page_size, page_size))
+		return ENOMEM;
+
+	if (madvise(tmp, page_size, MADV_DONTFORK) ||
+	    madvise(tmp, page_size, MADV_DOFORK)) {
+		free(tmp);	/* do not leak the probe page on failure */
+		return ENOSYS;
+	}
+	free(tmp);
+	mm_root = malloc(sizeof *mm_root);
+	if (!mm_root)
+		return ENOMEM;
+
+	mm_root->parent = NULL;	/* initial tree: one black node ... */
+	mm_root->left   = NULL;
+	mm_root->right  = NULL;
+	mm_root->color  = IBV_BLACK;
+	mm_root->start  = 0;	/* ... spanning 0..UINTPTR_MAX ... */
+	mm_root->end    = UINTPTR_MAX;
+	mm_root->refcnt = 0;	/* ... with no dontfork requests yet */
 
 	return 0;
+}
 
-fail:
-	if (node)
-		free(node);
+static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)	/* in-order predecessor, or NULL */
+{
+	if (node->left) {
+		node = node->left;	/* predecessor is the rightmost node of the left subtree */
+		while (node->right)
+			node = node->right;
+	} else {
+		while (node->parent && node == node->parent->left)	/* climb while we are a left child */
+			node = node->parent;
 
-	return -1;
+		node = node->parent;	/* NULL here means node had no predecessor */
+	}
+
+	return node;
 }
 
-static struct ibv_mem_node *__mm_find_first(uintptr_t start, uintptr_t end)
+static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)	/* in-order successor, or NULL */
 {
-	struct ibv_mem_node *node = mem_map.first;
+	if (node->right) {
+		node = node->right;	/* successor is the leftmost node of the right subtree */
+		while (node->left)
+			node = node->left;
+	} else {
+		while (node->parent && node == node->parent->right)	/* climb while we are a right child */
+			node = node->parent;
 
-	while (node) {
-		if ((node->start <= start && node->end >= start) ||
-		    (node->start <= end   && node->end >= end))
-			break;
-		node = node->next;
+		node = node->parent;	/* NULL here means node had no successor */
 	}
 
 	return node;
 }
 
-static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
+static void __mm_rotate_right(struct ibv_mem_node *node)	/* node->left must be non-NULL: it is dereferenced below */
 {
-	return node->prev;
+	struct ibv_mem_node *tmp;
+
+	tmp = node->left;	/* left child takes node's place */
+
+	node->left = tmp->right;	/* tmp's right subtree is re-hung as node's left subtree */
+	if (node->left)
+		node->left->parent = node;
+
+	if (node->parent) {
+		if (node->parent->right == node)
+			node->parent->right = tmp;
+		else
+			node->parent->left = tmp;
+	} else
+		mm_root = tmp;	/* node was the root, so tmp becomes the root */
+
+	tmp->parent = node->parent;
+
+	tmp->right = node;	/* node becomes tmp's right child */
+	node->parent = tmp;
 }
 
-static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
+static void __mm_rotate_left(struct ibv_mem_node *node)	/* node->right must be non-NULL: it is dereferenced below */
+{
+	struct ibv_mem_node *tmp;
+
+	tmp = node->right;	/* right child takes node's place */
+
+	node->right = tmp->left;	/* tmp's left subtree is re-hung as node's right subtree */
+	if (node->right)
+		node->right->parent = node;
+
+	if (node->parent) {
+		if (node->parent->right == node)
+			node->parent->right = tmp;
+		else
+			node->parent->left = tmp;
+	} else
+		mm_root = tmp;	/* node was the root, so tmp becomes the root */
+
+	tmp->parent = node->parent;
+
+	tmp->left = node;	/* node becomes tmp's left child */
+	node->parent = tmp;
+}
+
+static int verify(struct ibv_mem_node *node)	/* black-height of a valid subtree, 0 if invalid */
+{
+	int hl, hr;	/* results for the left and right subtrees */
+
+	if (!node)
+		return 1;	/* empty subtree is valid and counts as height 1 */
+
+	hl = verify(node->left);
+	hr = verify(node->right);	/* was node->left: right subtree was never checked */
+
+	if (!hl || !hr)
+		return 0;	/* a subtree is already invalid */
+	if (hl != hr)
+		return 0;	/* black-heights of the two subtrees differ */
+
+	if (node->color == IBV_RED) {
+		if (node->left && node->left->color != IBV_BLACK)
+			return 0;	/* red node with a red child */
+		if (node->right && node->right->color != IBV_BLACK)
+			return 0;
+		return hl;	/* red node does not add to the black-height */
+	}
+
+	return hl + 1;	/* black node adds one */
+}
+
+static void __mm_add_rebalance(struct ibv_mem_node *node)	/* restore RB coloring after node was linked in as red */
 {
-	return node->next;
+	struct ibv_mem_node *parent, *gp, *uncle;
+
+	while (node->parent && node->parent->color == IBV_RED) {	/* red node under a red parent */
+		parent = node->parent;
+		gp     = node->parent->parent;	/* dereferenced below without a NULL check */
+
+		if (parent == gp->left) {
+			uncle = gp->right;
+
+			if (uncle && uncle->color == IBV_RED) {
+				parent->color = IBV_BLACK;	/* recolor and continue from the grandparent */
+				uncle->color  = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				node = gp;
+			} else {
+				if (node == parent->right) {
+					__mm_rotate_left(parent);	/* turn the inner case into the outer case */
+					node   = parent;
+					parent = node->parent;
+				}
+
+				parent->color = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				__mm_rotate_right(gp);
+			}
+		} else {	/* mirror image of the branch above */
+			uncle = gp->left;
+
+			if (uncle && uncle->color == IBV_RED) {
+				parent->color = IBV_BLACK;
+				uncle->color  = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				node = gp;
+			} else {
+				if (node == parent->left) {
+					__mm_rotate_right(parent);
+					node   = parent;
+					parent = node->parent;
+				}
+
+				parent->color = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				__mm_rotate_left(gp);
+			}
+		}
+	}
+
+	mm_root->color = IBV_BLACK;	/* the root is always forced back to black */
 }
 
-static void __mm_add(struct ibv_mem_node *node,
-		     struct ibv_mem_node *new)
+static void __mm_add(struct ibv_mem_node *new)	/* insert new, keyed by new->start, then rebalance */
 {
-	new->prev  = node;
-	new->next  = node->next;
-	node->next = new;
-	if (new->next)
-		new->next->prev = new;
+	struct ibv_mem_node *node, *parent = NULL;
+
+	node = mm_root;
+	while (node) {	/* walk down to the leaf position for new->start */
+		parent = node;
+		if (node->start < new->start)
+			node = node->right;
+		else
+			node = node->left;
+	}
+
+	if (parent->start < new->start)	/* parent is dereferenced: the tree must not be empty here */
+		parent->right = new;
+	else
+		parent->left = new;
+
+	new->parent = parent;
+	new->left   = NULL;
+	new->right  = NULL;
+
+	new->color = IBV_RED;	/* new nodes start red; __mm_add_rebalance() fixes the coloring */
+	__mm_add_rebalance(new);
 }
 
 static void __mm_remove(struct ibv_mem_node *node)
 {
-	/* Never have to remove the first node, so we can use prev */
-	node->prev->next = node->next;
-	if (node->next)
-		node->next->prev = node->prev;
+	struct ibv_mem_node *child, *parent, *sib, *tmp;
+	int nodecol;	/* color taken out of the tree; only IBV_BLACK needs rebalancing */
+
+	if (node->left && node->right) {	/* two children: the predecessor takes node's place */
+		tmp = node->left;
+		while (tmp->right)
+			tmp = tmp->right;	/* tmp = rightmost node of the left subtree */
+
+		nodecol    = tmp->color;
+		child      = tmp->left;
+		tmp->color = node->color;	/* tmp inherits node's color along with its position */
+
+		if (tmp->parent != node) {
+			parent        = tmp->parent;
+			parent->right = tmp->left;	/* unlink tmp from its old position */
+			if (tmp->left)
+				tmp->left->parent = parent;
+
+			tmp->left   	   = node->left;
+			node->left->parent = tmp;
+		} else
+			parent = tmp;	/* tmp is node's direct left child */
+
+		tmp->right          = node->right;
+		node->right->parent = tmp;
+
+		tmp->parent = node->parent;
+		if (node->parent) {
+			if (node->parent->left == node)
+				node->parent->left = tmp;
+			else
+				node->parent->right = tmp;
+		} else
+			mm_root = tmp;
+	} else {	/* at most one child: splice that child (or NULL) into node's place */
+		nodecol = node->color;
+
+		child  = node->left ? node->left : node->right;
+		parent = node->parent;
+
+		if (child)
+			child->parent = parent;
+		if (parent) {
+			if (parent->left == node)
+				parent->left = child;
+			else
+				parent->right = child;
+		} else
+			mm_root = child;
+	}
+
+	free(node);	/* node is released here; callers must not use it afterwards */
+
+	if (nodecol == IBV_RED)
+		return;	/* a red node was taken out: no rebalancing is done */
+
+	while ((!child || child->color == IBV_BLACK) && child != mm_root) {	/* child may be NULL; parent tracks its position */
+		if (parent->left == child) {
+			sib = parent->right;	/* dereferenced below without a NULL check */
+
+			if (sib->color == IBV_RED) {
+				parent->color = IBV_RED;
+				sib->color    = IBV_BLACK;
+				__mm_rotate_left(parent);
+				sib = parent->right;
+			}
+
+			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
+			    (!sib->right || sib->right->color == IBV_BLACK)) {
+				sib->color = IBV_RED;	/* recolor the sibling and continue one level up */
+				child  = parent;
+				parent = child->parent;
+			} else {
+				if (!sib->right || sib->right->color == IBV_BLACK) {
+					if (sib->left)
+						sib->left->color = IBV_BLACK;
+					sib->color = IBV_RED;
+					__mm_rotate_right(sib);
+					sib = parent->right;
+				}
+
+				sib->color    = parent->color;
+				parent->color = IBV_BLACK;
+				if (sib->right)
+					sib->right->color = IBV_BLACK;
+				__mm_rotate_left(parent);
+				child = mm_root;	/* done; the final recoloring below then hits the root */
+				break;
+			}
+		} else {	/* mirror image of the branch above */
+			sib = parent->left;
+
+			if (sib->color == IBV_RED) {
+				parent->color = IBV_RED;
+				sib->color    = IBV_BLACK;
+				__mm_rotate_right(parent);
+				sib = parent->left;
+			}
+
+			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
+			    (!sib->right || sib->right->color == IBV_BLACK)) {
+				sib->color = IBV_RED;
+				child  = parent;
+				parent = child->parent;
+			} else {
+				if (!sib->left || sib->left->color == IBV_BLACK) {
+					if (sib->right)
+						sib->right->color = IBV_BLACK;
+					sib->color = IBV_RED;
+					__mm_rotate_left(sib);
+					sib = parent->left;
+				}
+
+				sib->color    = parent->color;
+				parent->color = IBV_BLACK;
+				if (sib->left)
+					sib->left->color = IBV_BLACK;
+				__mm_rotate_right(parent);
+				child = mm_root;
+				break;
+			}
+		}
+	}
+
+	if (child)
+		child->color = IBV_BLACK;	/* node where the loop stopped (or the root) ends up black */
+}
+
+static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)	/* node whose range contains start; end is unused */
+{
+	struct ibv_mem_node *node = mm_root;
+
+	while (node) {
+		if (node->start <= start && node->end >= start)
+			break;	/* start lies inside [node->start, node->end] */
+
+		if (node->start < start)
+			node = node->right;	/* node's range lies entirely below start */
+		else
+			node = node->left;
+	}
+
+	return node;	/* NULL if no node's range contains start */
 }
 
-int ibv_lock_range(void *base, size_t size)
+static int ibv_madvise_range(void *base, size_t size, int advice)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
+	int inc;
 	int ret = 0;
 
 	if (!size)
 		return 0;
 
-	start = (uintptr_t) base & ~(mem_map.page_size - 1);
-	end   = ((uintptr_t) (base + size + mem_map.page_size - 1) &
-		 ~(mem_map.page_size - 1)) - 1;
+	inc = advice == MADV_DONTFORK ? 1 : -1;
+
+	start = (uintptr_t) base & ~(page_size - 1);
+	end   = ((uintptr_t) (base + size + page_size - 1) &
+		 ~(page_size - 1)) - 1;
 
-	pthread_mutex_lock(&mem_map.mutex);
+	pthread_mutex_lock(&mm_mutex);
 
-	node = __mm_find_first(start, end);
+	node = __mm_find_start(start, end);
 
 	if (node->start < start) {
 		tmp = malloc(sizeof *tmp);
@@ -165,11 +477,19 @@ int ibv_lock_range(void *base, size_t si
 		tmp->refcnt = node->refcnt;
 		node->end   = start - 1;
 
-		__mm_add(node, tmp);
+		__mm_add(tmp);
 		node = tmp;
+	} else {
+		tmp = __mm_prev(node);
+		if (tmp && tmp->refcnt == node->refcnt + inc) {
+			tmp->end = node->end;
+			tmp->refcnt = node->refcnt;
+			__mm_remove(node);
+			node = tmp;
+		}
 	}
 
-	while (node->start <= end) {
+	while (node && node->start <= end) {
 		if (node->end > end) {
 			tmp = malloc(sizeof *tmp);
 			if (!tmp) {
@@ -182,13 +502,16 @@ int ibv_lock_range(void *base, size_t si
 			tmp->refcnt = node->refcnt;
 			node->end   = end;
 
-			__mm_add(node, tmp);
+			__mm_add(tmp);
 		}
 
+		node->refcnt += inc;
 
-		if (node->refcnt++ == 0) {
-			ret = mlock((void *) node->start,
-				    node->end - node->start + 1);
+		if ((inc == -1 && node->refcnt == 0) ||
+		    (inc ==  1 && node->refcnt == 1)) {
+			ret = madvise((void *) node->start,
+				      node->end - node->start + 1,
+				      advice);
 			if (ret)
 				goto out;
 		}
@@ -196,63 +519,36 @@ int ibv_lock_range(void *base, size_t si
 		node = __mm_next(node);
 	}
 
+	if (node) {
+		tmp = __mm_prev(node);
+		if (tmp && node->refcnt == tmp->refcnt) {
+			tmp->end = node->end;
+			__mm_remove(node);
+		}
+	}
+
 out:
-	pthread_mutex_unlock(&mem_map.mutex);
+	pthread_mutex_unlock(&mm_mutex);
 
 	return ret;
 }
 
-int ibv_unlock_range(void *base, size_t size)
+int ibv_dontfork_range(void *base, size_t size)
 {
-	uintptr_t start, end;
-	struct ibv_mem_node *node, *tmp;
-	int ret = 0;
-
-	if (!size)
+	if (mm_root)	/* non-NULL only after a successful ibv_fork_init() */
+		return ibv_madvise_range(base, size, MADV_DONTFORK);
+	else {
+		too_late = 1;	/* makes any later ibv_fork_init() return EINVAL */
 		return 0;
-
-	start = (uintptr_t) base & ~(mem_map.page_size - 1);
-	end   = ((uintptr_t) (base + size + mem_map.page_size - 1) &
-		 ~(mem_map.page_size - 1)) - 1;
-
-	pthread_mutex_lock(&mem_map.mutex);
-
-	node = __mm_find_first(start, end);
-
-	if (node->start != start) {
-		ret = -1;
-		goto out;
-	}
-
-	while (node && node->end <= end) {
-		if (--node->refcnt == 0) {
-			ret = munlock((void *) node->start,
-				      node->end - node->start + 1);
-		}
-
-		if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
-			__mm_prev(node)->end = node->end;
-			tmp = __mm_prev(node);
-			__mm_remove(node);
-			node = tmp;
-		}
-
-		node = __mm_next(node);
-	}
-
-	if (node && node->refcnt == __mm_prev(node)->refcnt) {
-		__mm_prev(node)->end = node->end;
-		tmp = __mm_prev(node);
-		__mm_remove(node);
 	}
+}
 
-	if (node->end != end) {
-		ret = -1;
-		goto out;
+int ibv_dofork_range(void *base, size_t size)
+{
+	if (mm_root)	/* non-NULL only after a successful ibv_fork_init() */
+		return ibv_madvise_range(base, size, MADV_DOFORK);
+	else {
+		too_late = 1;	/* makes any later ibv_fork_init() return EINVAL */
+		return 0;
 	}
-
-out:
-	pthread_mutex_unlock(&mem_map.mutex);
-
-	return ret;
 }
Index: libibverbs/README
===================================================================
--- libibverbs/README	(revision 8793)
+++ libibverbs/README	(working copy)
@@ -101,12 +101,6 @@ necessary permissions to release your wo
 TODO
 ====
 
-1.0 series
-----------
-
- * Use the MADV_DONTFORK advice for madvise(2) to make applications
-   that use fork(2) work better.
-
 1.1 series
 ----------
 
Index: libmthca/configure.in
===================================================================
--- libmthca/configure.in	(revision 8793)
+++ libmthca/configure.in	(working copy)
@@ -26,7 +26,7 @@ AC_C_CONST
 AC_CHECK_SIZEOF(long)
 
 dnl Checks for library functions
-AC_CHECK_FUNCS(ibv_read_sysfs_file)
+AC_CHECK_FUNCS(ibv_read_sysfs_file ibv_dontfork_range ibv_dofork_range)
 
 AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script,
     if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then
Index: libmthca/src/memfree.c
===================================================================
--- libmthca/src/memfree.c	(revision 8793)
+++ libmthca/src/memfree.c	(working copy)
@@ -46,8 +46,8 @@
 #define MTHCA_FREE_MAP_SIZE (MTHCA_DB_REC_PER_PAGE / (SIZEOF_LONG * 8))
 
 struct mthca_db_page {
-	unsigned long free[MTHCA_FREE_MAP_SIZE];
-	uint64_t     *db_rec;
+	unsigned long		free[MTHCA_FREE_MAP_SIZE];
+	struct mthca_buf	db_rec;
 };
 
 struct mthca_db_table {
@@ -91,7 +91,7 @@ int mthca_alloc_db(struct mthca_db_table
 	}
 
 	for (i = start; i != end; i += dir)
-		if (db_tab->page[i].db_rec)
+		if (db_tab->page[i].db_rec.buf)
 			for (j = 0; j < MTHCA_FREE_MAP_SIZE; ++j)
 				if (db_tab->page[i].free[j])
 					goto found;
@@ -101,18 +101,14 @@ int mthca_alloc_db(struct mthca_db_table
 		goto out;
 	}
 
-	{
-		void *tmp;
-
-		if (posix_memalign(&tmp, MTHCA_DB_REC_PAGE_SIZE,
-				   MTHCA_DB_REC_PAGE_SIZE)) {
-			ret = -1;
-			goto out;
-		}
-		db_tab->page[i].db_rec = tmp;
+	if (mthca_alloc_buf(&db_tab->page[i].db_rec,
+			    MTHCA_DB_REC_PAGE_SIZE,
+			    MTHCA_DB_REC_PAGE_SIZE)) {
+		ret = -1;
+		goto out;
 	}
 
-	memset(db_tab->page[i].db_rec, 0, MTHCA_DB_REC_PAGE_SIZE);
+	memset(db_tab->page[i].db_rec.buf, 0, MTHCA_DB_REC_PAGE_SIZE);
 	memset(db_tab->page[i].free, 0xff, sizeof db_tab->page[i].free);
 
 	if (group == 0)
@@ -140,7 +136,7 @@ found:
 		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
 
 	ret = i * MTHCA_DB_REC_PER_PAGE + j;
-	*db = (uint32_t *) &db_tab->page[i].db_rec[j];
+	*db = db_tab->page[i].db_rec.buf + j * 8;
 
 out:
 	pthread_mutex_unlock(&db_tab->mutex);
@@ -163,7 +159,7 @@ void mthca_free_db(struct mthca_db_table
 	page = db_tab->page + i;
 
 	pthread_mutex_lock(&db_tab->mutex);
-	page->db_rec[j] = 0;
+	*(uint64_t *) (page->db_rec.buf + j * 8) = 0;
 
 	if (i >= db_tab->min_group2)
 		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
@@ -190,7 +186,7 @@ struct mthca_db_table *mthca_alloc_db_ta
 	db_tab->min_group2 = npages - 1;
 
 	for (i = 0; i < npages; ++i)
-		db_tab->page[i].db_rec = NULL;
+		db_tab->page[i].db_rec.buf = NULL;
 
 	return db_tab;
 }
@@ -203,8 +199,8 @@ void mthca_free_db_tab(struct mthca_db_t
 		return;
 
 	for (i = 0; i < db_tab->npages; ++i)
-		if (db_tab->page[i].db_rec)
-			free(db_tab->page[i].db_rec);
+		if (db_tab->page[i].db_rec.buf)
+			mthca_free_buf(&db_tab->page[i].db_rec);
 
 	free(db_tab);
 }
Index: libmthca/src/qp.c
===================================================================
--- libmthca/src/qp.c	(revision 8793)
+++ libmthca/src/qp.c	(working copy)
@@ -58,12 +58,12 @@ static const uint8_t mthca_opcode[] = {
 
 static void *get_recv_wqe(struct mthca_qp *qp, int n)
 {
-	return qp->buf + (n << qp->rq.wqe_shift);
+	return qp->buf.buf + (n << qp->rq.wqe_shift);
 }
 
 static void *get_send_wqe(struct mthca_qp *qp, int n)
 {
-	return qp->buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
+	return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
 }
 
 void mthca_init_qp_indices(struct mthca_qp *qp)
@@ -821,13 +821,14 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd
 
 	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
 
-	if (posix_memalign(&qp->buf, to_mdev(pd->context->device)->page_size,
-			   align(qp->buf_size, to_mdev(pd->context->device)->page_size))) {
+	if (mthca_alloc_buf(&qp->buf, 
+			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
+			    to_mdev(pd->context->device)->page_size)) {
 		free(qp->wrid);
 		return -1;
 	}
 
-	memset(qp->buf, 0, qp->buf_size);
+	memset(qp->buf.buf, 0, qp->buf_size);
 
 	if (mthca_is_memfree(pd->context)) {
 		struct mthca_next_seg *next;
Index: libmthca/src/verbs.c
===================================================================
--- libmthca/src/verbs.c	(revision 8793)
+++ libmthca/src/verbs.c	(working copy)
@@ -188,11 +188,10 @@ struct ibv_cq *mthca_create_cq(struct ib
 		goto err;
 
 	cqe = align_cq_size(cqe);
-	cq->buf = mthca_alloc_cq_buf(to_mdev(context->device), cqe);
-	if (!cq->buf)
+	if (mthca_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe))
 		goto err;
 
-	cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf,
+	cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf.buf,
 				cqe * MTHCA_CQ_ENTRY_SIZE,
 				0, IBV_ACCESS_LOCAL_WRITE);
 	if (!cq->mr)
@@ -251,7 +250,7 @@ err_unreg:
 	mthca_dereg_mr(cq->mr);
 
 err_buf:
-	free(cq->buf);
+	mthca_free_buf(&cq->buf);
 
 err:
 	free(cq);
@@ -264,7 +263,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	struct mthca_cq *cq = to_mcq(ibcq);
 	struct mthca_resize_cq cmd;
 	struct ibv_mr *mr;
-	void *buf;
+	struct mthca_buf buf;
 	int old_cqe;
 	int ret;
 
@@ -280,17 +279,15 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 		goto out;
 	}
 
-	buf = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), cqe);
-	if (!buf) {
-		ret = ENOMEM;
+	ret = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe);
+	if (ret)
 		goto out;
-	}
 
-	mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf,
+	mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf.buf,
 			    cqe * MTHCA_CQ_ENTRY_SIZE,
 			    0, IBV_ACCESS_LOCAL_WRITE);
 	if (!mr) {
-		free(buf);
+		mthca_free_buf(&buf);
 		ret = ENOMEM;
 		goto out;
 	}
@@ -303,14 +300,14 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
 	if (ret) {
 		mthca_dereg_mr(mr);
-		free(buf);
+		mthca_free_buf(&buf);
 		goto out;
 	}
 
-	mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
+	mthca_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
 
 	mthca_dereg_mr(cq->mr);
-	free(cq->buf);
+	mthca_free_buf(&cq->buf);
 
 	cq->buf = buf;
 	cq->mr  = mr;
@@ -336,8 +333,7 @@ int mthca_destroy_cq(struct ibv_cq *cq)
 	}
 
 	mthca_dereg_mr(to_mcq(cq)->mr);
-
-	free(to_mcq(cq)->buf);
+	mthca_free_buf(&to_mcq(cq)->buf);
 	free(to_mcq(cq));
 
 	return 0;
@@ -389,7 +385,7 @@ struct ibv_srq *mthca_create_srq(struct 
 	if (mthca_alloc_srq_buf(pd, &attr->attr, srq))
 		goto err;
 
-	srq->mr = __mthca_reg_mr(pd, srq->buf, srq->buf_size, 0, 0);
+	srq->mr = __mthca_reg_mr(pd, srq->buf.buf, srq->buf_size, 0, 0);
 	if (!srq->mr)
 		goto err_free;
 
@@ -430,7 +426,7 @@ err_unreg:
 
 err_free:
 	free(srq->wrid);
-	free(srq->buf);
+	mthca_free_buf(&srq->buf);
 
 err:
 	free(srq);
@@ -469,7 +465,7 @@ int mthca_destroy_srq(struct ibv_srq *sr
 
 	mthca_dereg_mr(to_msrq(srq)->mr);
 
-	free(to_msrq(srq)->buf);
+	mthca_free_buf(&to_msrq(srq)->buf);
 	free(to_msrq(srq)->wrid);
 	free(to_msrq(srq));
 
@@ -507,7 +503,7 @@ struct ibv_qp *mthca_create_qp(struct ib
 	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
 		goto err_free;
 
-	qp->mr = __mthca_reg_mr(pd, qp->buf, qp->buf_size, 0, 0);
+	qp->mr = __mthca_reg_mr(pd, qp->buf.buf, qp->buf_size, 0, 0);
 	if (!qp->mr)
 		goto err_free;
 
@@ -574,7 +570,7 @@ err_unreg:
 
 err_free:
 	free(qp->wrid);
-	free(qp->buf);
+	mthca_free_buf(&qp->buf);
 
 err:
 	free(qp);
@@ -655,8 +651,7 @@ int mthca_destroy_qp(struct ibv_qp *qp)
 	}
 
 	mthca_dereg_mr(to_mqp(qp)->mr);
-
-	free(to_mqp(qp)->buf);
+	mthca_free_buf(&to_mqp(qp)->buf);
 	free(to_mqp(qp)->wrid);
 	free(to_mqp(qp));
 
Index: libmthca/src/mthca.h
===================================================================
--- libmthca/src/mthca.h	(revision 8793)
+++ libmthca/src/mthca.h	(working copy)
@@ -112,6 +112,11 @@ struct mthca_context {
 	int		       qp_table_mask;
 };
 
+struct mthca_buf {
+	void		       *buf;	/* start of the buffer; callers use NULL to mean "not allocated" */
+	size_t			length;	/* NOTE(review): presumably the allocated size in bytes -- confirm in buf.c */
+};
+
 struct mthca_pd {
 	struct ibv_pd         ibv_pd;
 	struct mthca_ah_page *ah_list;
@@ -121,7 +126,7 @@ struct mthca_pd {
 
 struct mthca_cq {
 	struct ibv_cq  	   ibv_cq;
-	void           	  *buf;
+	struct mthca_buf   buf;
 	pthread_spinlock_t lock;
 	struct ibv_mr  	  *mr;
 	uint32_t       	   cqn;
@@ -137,7 +142,7 @@ struct mthca_cq {
 
 struct mthca_srq {
 	struct ibv_srq     ibv_srq;
-	void              *buf;
+	struct mthca_buf   buf;
 	void           	  *last;
 	pthread_spinlock_t lock;
 	struct ibv_mr 	  *mr;
@@ -174,7 +179,7 @@ struct mthca_wq {
 
 struct mthca_qp {
 	struct ibv_qp    ibv_qp;
-	void            *buf;
+	struct mthca_buf buf;
 	uint64_t        *wrid;
 	int              send_wqe_offset;
 	int              max_inline_data;
@@ -259,6 +264,9 @@ static inline int mthca_is_memfree(struc
 	return to_mdev(ibctx->device)->hca_type == MTHCA_ARBEL;
 }
 
+int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+void mthca_free_buf(struct mthca_buf *buf);
+
 int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
 		   uint32_t **db);
 void mthca_set_db_qn(uint32_t *db, enum mthca_db_type type, uint32_t qn);
@@ -290,7 +298,7 @@ void mthca_arbel_cq_event(struct ibv_cq 
 void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn,
 		    struct mthca_srq *srq);
 void mthca_cq_resize_copy_cqes(struct mthca_cq *cq, void *buf, int new_cqe);
-void *mthca_alloc_cq_buf(struct mthca_device *dev, int cqe);
+int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent);
 
 struct ibv_srq *mthca_create_srq(struct ibv_pd *pd,
 				 struct ibv_srq_init_attr *attr);
Index: libmthca/src/cq.c
===================================================================
--- libmthca/src/cq.c	(revision 8793)
+++ libmthca/src/cq.c	(working copy)
@@ -126,7 +126,7 @@ struct mthca_err_cqe {
 
 static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
 {
-	return cq->buf + entry * MTHCA_CQ_ENTRY_SIZE;
+	return cq->buf.buf + entry * MTHCA_CQ_ENTRY_SIZE;
 }
 
 static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
@@ -612,17 +612,16 @@ void mthca_cq_resize_copy_cqes(struct mt
 		       get_cqe(cq, i & old_cqe), MTHCA_CQ_ENTRY_SIZE);
 }
 
-void *mthca_alloc_cq_buf(struct mthca_device *dev, int nent)
+int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent)	/* 0 on success, -1 on failure */
 {
-	void *buf;
 	int i;
 
-	if (posix_memalign(&buf, dev->page_size,
-			   align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size)))
-		return NULL;
+	if (mthca_alloc_buf(buf, align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size),	/* size rounded via align() */
+		    dev->page_size))
+		return -1;
 
 	for (i = 0; i < nent; ++i)
-		((struct mthca_cqe *) buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
+		((struct mthca_cqe *) buf->buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;	/* every CQE starts out hardware-owned */
 
-	return buf;
+	return 0;
 }
Index: libmthca/src/srq.c
===================================================================
--- libmthca/src/srq.c	(revision 8793)
+++ libmthca/src/srq.c	(working copy)
@@ -47,7 +47,7 @@
 
 static void *get_wqe(struct mthca_srq *srq, int n)
 {
-	return srq->buf + (n << srq->wqe_shift);
+	return srq->buf.buf + (n << srq->wqe_shift);
 }
 
 /*
@@ -292,13 +292,14 @@ int mthca_alloc_srq_buf(struct ibv_pd *p
 
 	srq->buf_size = srq->max << srq->wqe_shift;
 
-	if (posix_memalign(&srq->buf, to_mdev(pd->context->device)->page_size,
-			   align(srq->buf_size, to_mdev(pd->context->device)->page_size))) {
+	if (mthca_alloc_buf(&srq->buf,
+			    align(srq->buf_size, to_mdev(pd->context->device)->page_size),
+			    to_mdev(pd->context->device)->page_size)) {
 		free(srq->wrid);
 		return -1;
 	}
 
-	memset(srq->buf, 0, srq->buf_size);
+	memset(srq->buf.buf, 0, srq->buf_size);
 
 	/*
 	 * Now initialize the SRQ buffer so that all of the WQEs are
Index: libmthca/src/ah.c
===================================================================
--- libmthca/src/ah.c	(revision 8793)
+++ libmthca/src/ah.c	(working copy)
@@ -45,7 +45,7 @@
 
 struct mthca_ah_page {
 	struct mthca_ah_page *prev, *next;
-	void           	     *buf;
+	struct mthca_buf      buf;
 	struct ibv_mr 	     *mr;
 	int           	      use_cnt;
 	unsigned      	      free[0];
@@ -60,14 +60,14 @@ static struct mthca_ah_page *__add_page(
 	if (!page)
 		return NULL;
 
-	if (posix_memalign(&page->buf, page_size, page_size)) {
+	if (mthca_alloc_buf(&page->buf, page_size, page_size)) {
 		free(page);
 		return NULL;
 	}
 
-	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
+	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf.buf, page_size, 0);
 	if (!page->mr) {
-		free(page->buf);
+		mthca_free_buf(&page->buf);
 		free(page);
 		return NULL;
 	}
@@ -123,7 +123,7 @@ int mthca_alloc_av(struct mthca_pd *pd, 
 			if (page->free[i]) {
 				j = ffs(page->free[i]);
 				page->free[i] &= ~(1 << (j - 1));
-				ah->av = page->buf +
+				ah->av = page->buf.buf +
 					(i * 8 * sizeof (int) + (j - 1)) * sizeof *ah->av;
 				break;
 			}
@@ -172,7 +172,7 @@ void mthca_free_av(struct mthca_ah *ah)
 		pthread_mutex_lock(&pd->ah_mutex);
 
 		page = ah->page;
-		i = ((void *) ah->av - page->buf) / sizeof *ah->av;
+		i = ((void *) ah->av - page->buf.buf) / sizeof *ah->av;
 		page->free[i / (8 * sizeof (int))] |= 1 << (i % (8 * sizeof (int)));
 
 		if (!--page->use_cnt) {
@@ -184,7 +184,7 @@ void mthca_free_av(struct mthca_ah *ah)
 				page->next->prev = page->prev;
 
 			mthca_dereg_mr(page->mr);
-			free(page->buf);
+			mthca_free_buf(&page->buf);
 			free(page);
 		}
 
Index: libmthca/src/buf.c
===================================================================
--- libmthca/src/buf.c	(revision 0)
+++ libmthca/src/buf.c	(revision 0)
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+#  include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+
+#include "mthca.h"
+
+#if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE))
+
+/*
+ * If libibverbs isn't exporting these functions, then there's no
+ * point in doing it here, because the rest of libibverbs isn't going
+ * to be fork-safe anyway.
+ */
+static int ibv_dontfork_range(void *base, size_t size)
+{
+	return 0;
+}
+
+static int ibv_dofork_range(void *base, size_t size)
+{
+	return 0;
+}
+
+#endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */
+
+int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
+{
+	int ret;
+
+	ret = posix_memalign(&buf->buf, page_size, align(size, page_size));
+	if (ret)
+		return ret;
+
+	ret = ibv_dontfork_range(buf->buf, size);
+	if (ret)
+		free(buf->buf);
+
+	if (!ret)
+		buf->length = size;
+
+	return ret;
+}
+
+void mthca_free_buf(struct mthca_buf *buf)
+{
+	ibv_dofork_range(buf->buf, buf->length);
+	free(buf->buf);
+}
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+#  include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+
+#include "mthca.h"
+
+#if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE))
+
+/*
+ * If libibverbs isn't exporting these functions, then there's no
+ * point in doing it here, because the rest of libibverbs isn't going
+ * to be fork-safe anyway.
+ */
+static int ibv_dontfork_range(void *base, size_t size)
+{
+	return 0;
+}
+
+static int ibv_dofork_range(void *base, size_t size)
+{
+	return 0;
+}
+
+#endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */
+
+int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
+{
+	int ret;
+
+	ret = posix_memalign(&buf->buf, page_size, align(size, page_size));
+	if (ret)
+		return ret;
+
+	ret = ibv_dontfork_range(buf->buf, size);
+	if (ret)
+		free(buf->buf);
+
+	if (!ret)
+		buf->length = size;
+
+	return ret;
+}
+
+void mthca_free_buf(struct mthca_buf *buf)
+{
+	ibv_dofork_range(buf->buf, buf->length);
+	free(buf->buf);
+}
Index: libmthca/ChangeLog
===================================================================
--- libmthca/ChangeLog	(revision 8793)
+++ libmthca/ChangeLog	(working copy)
@@ -1,3 +1,19 @@
+2006-07-26  Roland Dreier  <rdreier at cisco.com>
+
+	* src/mthca.h, src/ah.c, src/cq.c, src/memfree.c, src/qp.c,
+	src/srq.c, src/verbs.c: Convert internal allocations for AH pages
+	(for non-memfree HCAs), CQ buffers, doorbell pages (for memfree
+	HCAs), QP buffers and SRQ buffers to use the new buffer
+	allocator.  This makes libmthca fork()-clean when built against
+	libibverbs 1.1.
+
+	* src/buf.c (mthca_alloc_buf, mthca_free_buf): Add new functions
+	to wrap up allocating page-aligned buffers.  The new functions
+	will call ibv_dontfork_range()/ibv_dofork_range() to do proper
+	madvise()ing to handle fork(), if applicable.
+
+	* configure.in: Check for ibv_dontfork_range() and ibv_dofork_range().
+
 2006-07-04  Dotan Barak  <dotanb at mellanox.co.il>
 
 	* src/verbs.c (mthca_create_cq, mthca_resize_cq): Passing huge
Index: libmthca/Makefile.am
===================================================================
--- libmthca/Makefile.am	(revision 8793)
+++ libmthca/Makefile.am	(working copy)
@@ -12,10 +12,9 @@ else
     mthca_version_script =
 endif
 
-src_mthca_la_SOURCES = src/ah.c src/cq.c src/memfree.c src/mthca.c src/qp.c \
-    src/srq.c src/verbs.c
-src_mthca_la_LDFLAGS = -avoid-version -module \
-    $(mthca_version_script)
+src_mthca_la_SOURCES = src/ah.c src/buf.c src/cq.c src/memfree.c src/mthca.c \
+    src/qp.c src/srq.c src/verbs.c
+src_mthca_la_LDFLAGS = -avoid-version -module $(mthca_version_script)
 
 DEBIAN = debian/changelog debian/compat debian/control debian/copyright \
     debian/libmthca1.install debian/libmthca-dev.install debian/rules




More information about the general mailing list