[openib-general] [PATCH/RFC] libibverbs and libmthca fork support
Roland Dreier
rdreier at cisco.com
Tue Aug 1 07:21:15 PDT 2006
> You forgot to include buf.c in the patch.
Oops, forgot to do svn add before generating the diff. Updated diff below:
Index: libibverbs/include/infiniband/driver.h
===================================================================
--- libibverbs/include/infiniband/driver.h (revision 8793)
+++ libibverbs/include/infiniband/driver.h (working copy)
@@ -135,6 +135,9 @@ int ibv_cmd_destroy_ah(struct ibv_ah *ah
int ibv_cmd_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
int ibv_cmd_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
+int ibv_dontfork_range(void *base, size_t size);
+int ibv_dofork_range(void *base, size_t size);
+
/*
* sysfs helper functions
*/
Index: libibverbs/include/infiniband/verbs.h
===================================================================
--- libibverbs/include/infiniband/verbs.h (revision 8793)
+++ libibverbs/include/infiniband/verbs.h (working copy)
@@ -285,6 +285,8 @@ struct ibv_pd {
struct ibv_mr {
struct ibv_context *context;
struct ibv_pd *pd;
+ void *addr;
+ size_t length;
uint32_t handle;
uint32_t lkey;
uint32_t rkey;
@@ -1016,6 +1018,14 @@ int ibv_attach_mcast(struct ibv_qp *qp,
*/
int ibv_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
+/**
+ * ibv_fork_init - Prepare data structures so that fork() may be used
+ * safely. If this function is not called or returns a non-zero
+ * status, then libibverbs data structures are not fork()-safe and the
+ * effect of an application calling fork() is undefined.
+ */
+int ibv_fork_init(void);
+
END_C_DECLS
# undef __attribute_const
Index: libibverbs/ChangeLog
===================================================================
--- libibverbs/ChangeLog (revision 8793)
+++ libibverbs/ChangeLog (working copy)
@@ -1,3 +1,29 @@
+2006-07-26 Roland Dreier <rdreier at cisco.com>
+
+ * src/verbs.c (ibv_reg_mr, ibv_dereg_mr): Add calls to
+ ibv_dontfork_range() and ibv_dofork_range() for memory regions
+ registered by library consumers.
+
+ * include/infiniband/verbs.h: Add declaration of ibv_fork_init().
+
+ * include/infiniband/driver.h: Add declarations of
+ ibv_dontfork_range() and ibv_dofork_range().
+
+ * src/memory.c: Rewrite to use a red-black tree instead of a
+ linked list. Change from doing mlock()/munlock() to
+ madvise(..., MADV_DONTFORK) and madvise(..., MADV_DOFORK), and
+ change the name of the entry points to ibv_dontfork_range() and
+ ibv_dofork_range(). Add ibv_fork_init() for applications to
+ request fork-safe behavior.
+
+ * src/ibverbs.h: Kill off unused declarations.
+
+ * src/init.c (ibverbs_init): Get rid of call to ibv_init_mem_map().
+
+ * include/infiniband/verbs.h: Add addr and length field to struct
+ ibv_mr so that memory regions can be madvised(). This changes the
+ ABI, since the layout of struct ibv_mr is changed.
+
2006-07-04 Roland Dreier <rdreier at cisco.com>
* include/infiniband/arch.h: Fix typo in sparc mb()
Index: libibverbs/src/libibverbs.map
===================================================================
--- libibverbs/src/libibverbs.map (revision 8793)
+++ libibverbs/src/libibverbs.map (working copy)
@@ -74,6 +74,9 @@ IBVERBS_1.0 {
mult_to_ibv_rate;
ibv_get_sysfs_path;
ibv_read_sysfs_file;
+ ibv_fork_init;
+ ibv_dontfork_range;
+ ibv_dofork_range;
local: *;
};
Index: libibverbs/src/ibverbs.h
===================================================================
--- libibverbs/src/ibverbs.h (revision 8793)
+++ libibverbs/src/ibverbs.h (working copy)
@@ -58,11 +58,7 @@ struct ibv_abi_compat_v2 {
extern HIDDEN int abi_ver;
-extern HIDDEN int ibverbs_init(struct ibv_device ***list);
-
-extern HIDDEN int ibv_init_mem_map(void);
-extern HIDDEN int ibv_lock_range(void *base, size_t size);
-extern HIDDEN int ibv_unlock_range(void *base, size_t size);
+HIDDEN int ibverbs_init(struct ibv_device ***list);
#define IBV_INIT_CMD(cmd, size, opcode) \
do { \
Index: libibverbs/src/verbs.c
===================================================================
--- libibverbs/src/verbs.c (revision 8793)
+++ libibverbs/src/verbs.c (working copy)
@@ -155,18 +155,32 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd
{
struct ibv_mr *mr;
+ if (ibv_dontfork_range(addr, length))
+ return NULL;
+
mr = pd->context->ops.reg_mr(pd, addr, length, access);
if (mr) {
mr->context = pd->context;
mr->pd = pd;
- }
+ mr->addr = addr;
+ mr->length = length;
+ } else
+ ibv_dofork_range(addr, length);
return mr;
}
+/*
+ * Deregister a memory region through the provider's dereg_mr op and,
+ * if that succeeds, call ibv_dofork_range() on the region's range to
+ * undo the ibv_dontfork_range() call made by ibv_reg_mr().  Returns
+ * the provider op's return value.
+ */
int ibv_dereg_mr(struct ibv_mr *mr)
{
- return mr->context->ops.dereg_mr(mr);
+ int ret;
+ /* Saved up front so that mr is not read after dereg_mr() returns. */
+ void *addr = mr->addr;
+ size_t length = mr->length;
+
+ ret = mr->context->ops.dereg_mr(mr);
+ if (!ret)
+ ibv_dofork_range(addr, length);
+
+ return ret;
}
static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
Index: libibverbs/src/init.c
===================================================================
--- libibverbs/src/init.c (revision 8793)
+++ libibverbs/src/init.c (working copy)
@@ -205,9 +205,6 @@ HIDDEN int ibverbs_init(struct ibv_devic
*list = NULL;
- if (ibv_init_mem_map())
- return 0;
-
find_drivers(default_path);
/*
Index: libibverbs/src/memory.c
===================================================================
--- libibverbs/src/memory.c (revision 8793)
+++ libibverbs/src/memory.c (working copy)
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -36,6 +37,7 @@
# include <config.h>
#endif /* HAVE_CONFIG_H */
+#include <errno.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
@@ -44,114 +46,424 @@
#include "ibverbs.h"
/*
- * We keep a linked list of page ranges that have been locked along with a
- * reference count to manage overlapping registrations, etc.
- *
- * Eventually we should turn this into an RB-tree or something similar
- * to avoid the O(n) cost of registering/unregistering memory.
+ * Most distros' headers don't have these yet.
*/
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
+/*
+ * One node of the red-black tree of address ranges, keyed by start.
+ * A node covers the inclusive range [start, end].  refcnt is raised
+ * by MADV_DONTFORK requests on the range and lowered by MADV_DOFORK
+ * requests.
+ */
struct ibv_mem_node {
- struct ibv_mem_node *prev, *next;
- uintptr_t start, end;
- int refcnt;
+ enum {
+ IBV_RED,
+ IBV_BLACK
+ } color;
+ struct ibv_mem_node *parent;
+ struct ibv_mem_node *left, *right;
+ uintptr_t start, end;
+ int refcnt;
};
-static struct {
- struct ibv_mem_node *first;
- pthread_mutex_t mutex;
- uintptr_t page_size;
-} mem_map;
+static struct ibv_mem_node *mm_root;
+static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
+static int page_size;
+static int too_late;
-int ibv_init_mem_map(void)
+/*
+ * Set up the range tree used to make registered memory fork()-safe.
+ *
+ * Returns 0 on success or if the tree already exists; EINVAL if
+ * ibv_dontfork_range()/ibv_dofork_range() already ran without a tree
+ * (too_late); errno if the page size cannot be read; ENOMEM if an
+ * allocation fails; ENOSYS if the kernel rejects MADV_DONTFORK or
+ * MADV_DOFORK on a scratch page.
+ */
+int ibv_fork_init(void)
{
- struct ibv_mem_node *node = NULL;
-
- node = malloc(sizeof *node);
- if (!node)
- goto fail;
-
- node->prev = node->next = NULL;
- node->start = 0;
- node->end = UINTPTR_MAX;
- node->refcnt = 0;
+ void *tmp;
+ int ret;
- mem_map.first = node;
+ if (mm_root)
+ return 0;
- mem_map.page_size = sysconf(_SC_PAGESIZE);
- if (mem_map.page_size < 0)
- goto fail;
+ if (too_late)
+ return EINVAL;
- if (pthread_mutex_init(&mem_map.mutex, NULL))
- goto fail;
+ page_size = sysconf(_SC_PAGESIZE);
+ if (page_size < 0)
+ return errno;
+
+ if (posix_memalign(&tmp, page_size, page_size))
+ return ENOMEM;
+
+ /*
+ * Probe kernel support on a scratch page.  The page is freed
+ * before the result is checked so it is not leaked when the
+ * probe fails.
+ */
+ ret = madvise(tmp, page_size, MADV_DONTFORK) ||
+ madvise(tmp, page_size, MADV_DOFORK);
+
+ free(tmp);
+
+ if (ret)
+ return ENOSYS;
+
+ mm_root = malloc(sizeof *mm_root);
+ if (!mm_root)
+ return ENOMEM;
+
+ /* A single black root with refcnt 0 covers the whole address space. */
+ mm_root->parent = NULL;
+ mm_root->left = NULL;
+ mm_root->right = NULL;
+ mm_root->color = IBV_BLACK;
+ mm_root->start = 0;
+ mm_root->end = UINTPTR_MAX;
+ mm_root->refcnt = 0;
return 0;
+}
-fail:
- if (node)
- free(node);
+/*
+ * Return the in-order predecessor of node, or NULL if node is the
+ * leftmost node of the tree.
+ */
+static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
+{
+ if (node->left) {
+ /* Predecessor is the rightmost node of the left subtree. */
+ node = node->left;
+ while (node->right)
+ node = node->right;
+ } else {
+ /* Climb until we leave a right subtree; that ancestor is it. */
+ while (node->parent && node == node->parent->left)
+ node = node->parent;
- return -1;
+ node = node->parent;
+ }
+
+ return node;
}
-static struct ibv_mem_node *__mm_find_first(uintptr_t start, uintptr_t end)
+/*
+ * Return the in-order successor of node, or NULL if node is the
+ * rightmost node of the tree.
+ */
+static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
{
- struct ibv_mem_node *node = mem_map.first;
+ if (node->right) {
+ /* Successor is the leftmost node of the right subtree. */
+ node = node->right;
+ while (node->left)
+ node = node->left;
+ } else {
+ /* Climb until we leave a left subtree; that ancestor is it. */
+ while (node->parent && node == node->parent->right)
+ node = node->parent;
- while (node) {
- if ((node->start <= start && node->end >= start) ||
- (node->start <= end && node->end >= end))
- break;
- node = node->next;
+ node = node->parent;
}
return node;
}
-static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
+/*
+ * Rotate right around node: node's left child takes node's place
+ * under node's parent (or becomes mm_root if node had no parent) and
+ * node becomes that child's right child.  node->left is dereferenced
+ * unconditionally, so it must not be NULL.
+ */
+static void __mm_rotate_right(struct ibv_mem_node *node)
{
- return node->prev;
+ struct ibv_mem_node *tmp;
+
+ tmp = node->left;
+
+ /* tmp's right subtree moves under node as its new left subtree. */
+ node->left = tmp->right;
+ if (node->left)
+ node->left->parent = node;
+
+ if (node->parent) {
+ if (node->parent->right == node)
+ node->parent->right = tmp;
+ else
+ node->parent->left = tmp;
+ } else
+ mm_root = tmp;
+
+ tmp->parent = node->parent;
+
+ tmp->right = node;
+ node->parent = tmp;
}
-static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
+/*
+ * Rotate left around node: node's right child takes node's place
+ * under node's parent (or becomes mm_root if node had no parent) and
+ * node becomes that child's left child.  node->right is dereferenced
+ * unconditionally, so it must not be NULL.
+ */
+static void __mm_rotate_left(struct ibv_mem_node *node)
+{
+ struct ibv_mem_node *tmp;
+
+ tmp = node->right;
+
+ /* tmp's left subtree moves under node as its new right subtree. */
+ node->right = tmp->left;
+ if (node->right)
+ node->right->parent = node;
+
+ if (node->parent) {
+ if (node->parent->right == node)
+ node->parent->right = tmp;
+ else
+ node->parent->left = tmp;
+ } else
+ mm_root = tmp;
+
+ tmp->parent = node->parent;
+
+ tmp->left = node;
+ node->parent = tmp;
+}
+
+/*
+ * Debugging helper: check the red-black invariants of the subtree
+ * rooted at node.  Returns 0 if the subtree is invalid (a red node
+ * has a red child, or the left and right sides have different black
+ * heights); otherwise returns the subtree's black height, with an
+ * empty subtree counting as 1.
+ */
+static int verify(struct ibv_mem_node *node)
+{
+ int hl, hr;
+
+ if (!node)
+ return 1;
+
+ hl = verify(node->left);
+ /* Must descend into the right child, not the left one again. */
+ hr = verify(node->right);
+
+ if (!hl || !hr)
+ return 0;
+ if (hl != hr)
+ return 0;
+
+ if (node->color == IBV_RED) {
+ if (node->left && node->left->color != IBV_BLACK)
+ return 0;
+ if (node->right && node->right->color != IBV_BLACK)
+ return 0;
+ return hl;
+ }
+
+ return hl + 1;
+}
+
+/*
+ * Restore the red-black invariants after node has been linked into
+ * the tree as a red node.  Loops while node's parent is red, either
+ * recolouring and moving up to the grandparent or rotating, and
+ * finally forces mm_root to black.
+ */
+static void __mm_add_rebalance(struct ibv_mem_node *node)
{
- return node->next;
+ struct ibv_mem_node *parent, *gp, *uncle;
+
+ while (node->parent && node->parent->color == IBV_RED) {
+ parent = node->parent;
+ gp = node->parent->parent;
+
+ if (parent == gp->left) {
+ uncle = gp->right;
+
+ if (uncle && uncle->color == IBV_RED) {
+ /* Red uncle: recolour and continue from the grandparent. */
+ parent->color = IBV_BLACK;
+ uncle->color = IBV_BLACK;
+ gp->color = IBV_RED;
+
+ node = gp;
+ } else {
+ /* Black or missing uncle: straighten an inner child, then rotate gp. */
+ if (node == parent->right) {
+ __mm_rotate_left(parent);
+ node = parent;
+ parent = node->parent;
+ }
+
+ parent->color = IBV_BLACK;
+ gp->color = IBV_RED;
+
+ __mm_rotate_right(gp);
+ }
+ } else {
+ /* Mirror image of the branch above. */
+ uncle = gp->left;
+
+ if (uncle && uncle->color == IBV_RED) {
+ parent->color = IBV_BLACK;
+ uncle->color = IBV_BLACK;
+ gp->color = IBV_RED;
+
+ node = gp;
+ } else {
+ if (node == parent->left) {
+ __mm_rotate_right(parent);
+ node = parent;
+ parent = node->parent;
+ }
+
+ parent->color = IBV_BLACK;
+ gp->color = IBV_RED;
+
+ __mm_rotate_left(gp);
+ }
+ }
+ }
+
+ mm_root->color = IBV_BLACK;
}
-static void __mm_add(struct ibv_mem_node *node,
- struct ibv_mem_node *new)
+/*
+ * Insert new into the tree, ordered by start, then rebalance.  The
+ * descent goes right when the current node's start is less than
+ * new->start and left otherwise.  parent is dereferenced after the
+ * descent, so mm_root must not be NULL when this is called.
+ */
+static void __mm_add(struct ibv_mem_node *new)
{
- new->prev = node;
- new->next = node->next;
- node->next = new;
- if (new->next)
- new->next->prev = new;
+ struct ibv_mem_node *node, *parent = NULL;
+
+ node = mm_root;
+ while (node) {
+ parent = node;
+ if (node->start < new->start)
+ node = node->right;
+ else
+ node = node->left;
+ }
+
+ if (parent->start < new->start)
+ parent->right = new;
+ else
+ parent->left = new;
+
+ new->parent = parent;
+ new->left = NULL;
+ new->right = NULL;
+
+ /* New nodes start out red; the rebalance fixes any red-red pair. */
+ new->color = IBV_RED;
+ __mm_add_rebalance(new);
}
+/*
+ * Unlink node from the tree, free it, and rebalance.
+ *
+ * If node has two children, the rightmost node of its left subtree
+ * (tmp) is moved into node's position and takes node's colour.
+ * Otherwise node's only child (possibly NULL) replaces it directly.
+ * nodecol records the colour that was removed from the tree's
+ * structure; the fix-up loop below runs only if that colour was black.
+ */
static void __mm_remove(struct ibv_mem_node *node)
{
- /* Never have to remove the first node, so we can use prev */
- node->prev->next = node->next;
- if (node->next)
- node->next->prev = node->prev;
+ struct ibv_mem_node *child, *parent, *sib, *tmp;
+ int nodecol;
+
+ if (node->left && node->right) {
+ tmp = node->left;
+ while (tmp->right)
+ tmp = tmp->right;
+
+ nodecol = tmp->color;
+ child = tmp->left;
+ tmp->color = node->color;
+
+ if (tmp->parent != node) {
+ /* Detach tmp from its old position before re-linking it. */
+ parent = tmp->parent;
+ parent->right = tmp->left;
+ if (tmp->left)
+ tmp->left->parent = parent;
+
+ tmp->left = node->left;
+ node->left->parent = tmp;
+ } else
+ parent = tmp;
+
+ tmp->right = node->right;
+ node->right->parent = tmp;
+
+ tmp->parent = node->parent;
+ if (node->parent) {
+ if (node->parent->left == node)
+ node->parent->left = tmp;
+ else
+ node->parent->right = tmp;
+ } else
+ mm_root = tmp;
+ } else {
+ nodecol = node->color;
+
+ child = node->left ? node->left : node->right;
+ parent = node->parent;
+
+ if (child)
+ child->parent = parent;
+ if (parent) {
+ if (parent->left == node)
+ parent->left = child;
+ else
+ parent->right = child;
+ } else
+ mm_root = child;
+ }
+
+ free(node);
+
+ if (nodecol == IBV_RED)
+ return;
+
+ /*
+ * A black node was removed: walk up from child (which may be
+ * NULL, hence the separate parent variable) until the missing
+ * black is absorbed or the root is reached.
+ */
+ while ((!child || child->color == IBV_BLACK) && child != mm_root) {
+ if (parent->left == child) {
+ sib = parent->right;
+
+ if (sib->color == IBV_RED) {
+ parent->color = IBV_RED;
+ sib->color = IBV_BLACK;
+ __mm_rotate_left(parent);
+ sib = parent->right;
+ }
+
+ if ((!sib->left || sib->left->color == IBV_BLACK) &&
+ (!sib->right || sib->right->color == IBV_BLACK)) {
+ sib->color = IBV_RED;
+ child = parent;
+ parent = child->parent;
+ } else {
+ if (!sib->right || sib->right->color == IBV_BLACK) {
+ if (sib->left)
+ sib->left->color = IBV_BLACK;
+ sib->color = IBV_RED;
+ __mm_rotate_right(sib);
+ sib = parent->right;
+ }
+
+ sib->color = parent->color;
+ parent->color = IBV_BLACK;
+ if (sib->right)
+ sib->right->color = IBV_BLACK;
+ __mm_rotate_left(parent);
+ child = mm_root;
+ break;
+ }
+ } else {
+ /* Mirror image of the branch above. */
+ sib = parent->left;
+
+ if (sib->color == IBV_RED) {
+ parent->color = IBV_RED;
+ sib->color = IBV_BLACK;
+ __mm_rotate_right(parent);
+ sib = parent->left;
+ }
+
+ if ((!sib->left || sib->left->color == IBV_BLACK) &&
+ (!sib->right || sib->right->color == IBV_BLACK)) {
+ sib->color = IBV_RED;
+ child = parent;
+ parent = child->parent;
+ } else {
+ if (!sib->left || sib->left->color == IBV_BLACK) {
+ if (sib->right)
+ sib->right->color = IBV_BLACK;
+ sib->color = IBV_RED;
+ __mm_rotate_left(sib);
+ sib = parent->left;
+ }
+
+ sib->color = parent->color;
+ parent->color = IBV_BLACK;
+ if (sib->left)
+ sib->left->color = IBV_BLACK;
+ __mm_rotate_right(parent);
+ child = mm_root;
+ break;
+ }
+ }
+ }
+
+ if (child)
+ child->color = IBV_BLACK;
+}
+
+/*
+ * Return the node whose inclusive range [node->start, node->end]
+ * contains the address start, or NULL if no node contains it.  The
+ * end parameter is not used by the search.
+ */
+static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
+{
+ struct ibv_mem_node *node = mm_root;
+
+ while (node) {
+ if (node->start <= start && node->end >= start)
+ break;
+
+ if (node->start < start)
+ node = node->right;
+ else
+ node = node->left;
+ }
+
+ return node;
}
-int ibv_lock_range(void *base, size_t size)
+static int ibv_madvise_range(void *base, size_t size, int advice)
{
uintptr_t start, end;
struct ibv_mem_node *node, *tmp;
+ int inc;
int ret = 0;
if (!size)
return 0;
- start = (uintptr_t) base & ~(mem_map.page_size - 1);
- end = ((uintptr_t) (base + size + mem_map.page_size - 1) &
- ~(mem_map.page_size - 1)) - 1;
+ inc = advice == MADV_DONTFORK ? 1 : -1;
+
+ start = (uintptr_t) base & ~(page_size - 1);
+ end = ((uintptr_t) (base + size + page_size - 1) &
+ ~(page_size - 1)) - 1;
- pthread_mutex_lock(&mem_map.mutex);
+ pthread_mutex_lock(&mm_mutex);
- node = __mm_find_first(start, end);
+ node = __mm_find_start(start, end);
if (node->start < start) {
tmp = malloc(sizeof *tmp);
@@ -165,11 +477,19 @@ int ibv_lock_range(void *base, size_t si
tmp->refcnt = node->refcnt;
node->end = start - 1;
- __mm_add(node, tmp);
+ __mm_add(tmp);
node = tmp;
+ } else {
+ tmp = __mm_prev(node);
+ if (tmp && tmp->refcnt == node->refcnt + inc) {
+ tmp->end = node->end;
+ tmp->refcnt = node->refcnt;
+ __mm_remove(node);
+ node = tmp;
+ }
}
- while (node->start <= end) {
+ while (node && node->start <= end) {
if (node->end > end) {
tmp = malloc(sizeof *tmp);
if (!tmp) {
@@ -182,13 +502,16 @@ int ibv_lock_range(void *base, size_t si
tmp->refcnt = node->refcnt;
node->end = end;
- __mm_add(node, tmp);
+ __mm_add(tmp);
}
+ node->refcnt += inc;
- if (node->refcnt++ == 0) {
- ret = mlock((void *) node->start,
- node->end - node->start + 1);
+ if ((inc == -1 && node->refcnt == 0) ||
+ (inc == 1 && node->refcnt == 1)) {
+ ret = madvise((void *) node->start,
+ node->end - node->start + 1,
+ advice);
if (ret)
goto out;
}
@@ -196,63 +519,36 @@ int ibv_lock_range(void *base, size_t si
node = __mm_next(node);
}
+ if (node) {
+ tmp = __mm_prev(node);
+ if (tmp && node->refcnt == tmp->refcnt) {
+ tmp->end = node->end;
+ __mm_remove(node);
+ }
+ }
+
out:
- pthread_mutex_unlock(&mem_map.mutex);
+ pthread_mutex_unlock(&mm_mutex);
return ret;
}
-int ibv_unlock_range(void *base, size_t size)
+/*
+ * Take a MADV_DONTFORK reference on [base, base + size).  If the
+ * range tree has not been created (mm_root is NULL) this does nothing
+ * except set too_late, which makes a later ibv_fork_init() return
+ * EINVAL, and returns 0.
+ */
+int ibv_dontfork_range(void *base, size_t size)
{
- uintptr_t start, end;
- struct ibv_mem_node *node, *tmp;
- int ret = 0;
-
- if (!size)
+ if (mm_root)
+ return ibv_madvise_range(base, size, MADV_DONTFORK);
+ else {
+ too_late = 1;
return 0;
-
- start = (uintptr_t) base & ~(mem_map.page_size - 1);
- end = ((uintptr_t) (base + size + mem_map.page_size - 1) &
- ~(mem_map.page_size - 1)) - 1;
-
- pthread_mutex_lock(&mem_map.mutex);
-
- node = __mm_find_first(start, end);
-
- if (node->start != start) {
- ret = -1;
- goto out;
- }
-
- while (node && node->end <= end) {
- if (--node->refcnt == 0) {
- ret = munlock((void *) node->start,
- node->end - node->start + 1);
- }
-
- if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
- __mm_prev(node)->end = node->end;
- tmp = __mm_prev(node);
- __mm_remove(node);
- node = tmp;
- }
-
- node = __mm_next(node);
- }
-
- if (node && node->refcnt == __mm_prev(node)->refcnt) {
- __mm_prev(node)->end = node->end;
- tmp = __mm_prev(node);
- __mm_remove(node);
}
+}
- if (node->end != end) {
- ret = -1;
- goto out;
+/*
+ * Drop a MADV_DONTFORK reference on [base, base + size) by applying
+ * MADV_DOFORK through ibv_madvise_range().  If the range tree has not
+ * been created (mm_root is NULL) this does nothing except set
+ * too_late, and returns 0.
+ */
+int ibv_dofork_range(void *base, size_t size)
+{
+ if (mm_root)
+ return ibv_madvise_range(base, size, MADV_DOFORK);
+ else {
+ too_late = 1;
+ return 0;
}
-
-out:
- pthread_mutex_unlock(&mem_map.mutex);
-
- return ret;
}
Index: libibverbs/README
===================================================================
--- libibverbs/README (revision 8793)
+++ libibverbs/README (working copy)
@@ -101,12 +101,6 @@ necessary permissions to release your wo
TODO
====
-1.0 series
-----------
-
- * Use the MADV_DONTFORK advice for madvise(2) to make applications
- that use fork(2) work better.
-
1.1 series
----------
Index: libmthca/configure.in
===================================================================
--- libmthca/configure.in (revision 8793)
+++ libmthca/configure.in (working copy)
@@ -26,7 +26,7 @@ AC_C_CONST
AC_CHECK_SIZEOF(long)
dnl Checks for library functions
-AC_CHECK_FUNCS(ibv_read_sysfs_file)
+AC_CHECK_FUNCS(ibv_read_sysfs_file ibv_dontfork_range ibv_dofork_range)
AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script,
if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then
Index: libmthca/src/memfree.c
===================================================================
--- libmthca/src/memfree.c (revision 8793)
+++ libmthca/src/memfree.c (working copy)
@@ -46,8 +46,8 @@
#define MTHCA_FREE_MAP_SIZE (MTHCA_DB_REC_PER_PAGE / (SIZEOF_LONG * 8))
struct mthca_db_page {
- unsigned long free[MTHCA_FREE_MAP_SIZE];
- uint64_t *db_rec;
+ unsigned long free[MTHCA_FREE_MAP_SIZE];
+ struct mthca_buf db_rec;
};
struct mthca_db_table {
@@ -91,7 +91,7 @@ int mthca_alloc_db(struct mthca_db_table
}
for (i = start; i != end; i += dir)
- if (db_tab->page[i].db_rec)
+ if (db_tab->page[i].db_rec.buf)
for (j = 0; j < MTHCA_FREE_MAP_SIZE; ++j)
if (db_tab->page[i].free[j])
goto found;
@@ -101,18 +101,14 @@ int mthca_alloc_db(struct mthca_db_table
goto out;
}
- {
- void *tmp;
-
- if (posix_memalign(&tmp, MTHCA_DB_REC_PAGE_SIZE,
- MTHCA_DB_REC_PAGE_SIZE)) {
- ret = -1;
- goto out;
- }
- db_tab->page[i].db_rec = tmp;
+ if (mthca_alloc_buf(&db_tab->page[i].db_rec,
+ MTHCA_DB_REC_PAGE_SIZE,
+ MTHCA_DB_REC_PAGE_SIZE)) {
+ ret = -1;
+ goto out;
}
- memset(db_tab->page[i].db_rec, 0, MTHCA_DB_REC_PAGE_SIZE);
+ memset(db_tab->page[i].db_rec.buf, 0, MTHCA_DB_REC_PAGE_SIZE);
memset(db_tab->page[i].free, 0xff, sizeof db_tab->page[i].free);
if (group == 0)
@@ -140,7 +136,7 @@ found:
j = MTHCA_DB_REC_PER_PAGE - 1 - j;
ret = i * MTHCA_DB_REC_PER_PAGE + j;
- *db = (uint32_t *) &db_tab->page[i].db_rec[j];
+ *db = db_tab->page[i].db_rec.buf + j * 8;
out:
pthread_mutex_unlock(&db_tab->mutex);
@@ -163,7 +159,7 @@ void mthca_free_db(struct mthca_db_table
page = db_tab->page + i;
pthread_mutex_lock(&db_tab->mutex);
- page->db_rec[j] = 0;
+ *(uint64_t *) (page->db_rec.buf + j * 8) = 0;
if (i >= db_tab->min_group2)
j = MTHCA_DB_REC_PER_PAGE - 1 - j;
@@ -190,7 +186,7 @@ struct mthca_db_table *mthca_alloc_db_ta
db_tab->min_group2 = npages - 1;
for (i = 0; i < npages; ++i)
- db_tab->page[i].db_rec = NULL;
+ db_tab->page[i].db_rec.buf = NULL;
return db_tab;
}
@@ -203,8 +199,8 @@ void mthca_free_db_tab(struct mthca_db_t
return;
for (i = 0; i < db_tab->npages; ++i)
- if (db_tab->page[i].db_rec)
- free(db_tab->page[i].db_rec);
+ if (db_tab->page[i].db_rec.buf)
+ mthca_free_buf(&db_tab->page[i].db_rec);
free(db_tab);
}
Index: libmthca/src/qp.c
===================================================================
--- libmthca/src/qp.c (revision 8793)
+++ libmthca/src/qp.c (working copy)
@@ -58,12 +58,12 @@ static const uint8_t mthca_opcode[] = {
static void *get_recv_wqe(struct mthca_qp *qp, int n)
{
- return qp->buf + (n << qp->rq.wqe_shift);
+ return qp->buf.buf + (n << qp->rq.wqe_shift);
}
static void *get_send_wqe(struct mthca_qp *qp, int n)
{
- return qp->buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
+ return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
}
void mthca_init_qp_indices(struct mthca_qp *qp)
@@ -821,13 +821,14 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd
qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
- if (posix_memalign(&qp->buf, to_mdev(pd->context->device)->page_size,
- align(qp->buf_size, to_mdev(pd->context->device)->page_size))) {
+ if (mthca_alloc_buf(&qp->buf,
+ align(qp->buf_size, to_mdev(pd->context->device)->page_size),
+ to_mdev(pd->context->device)->page_size)) {
free(qp->wrid);
return -1;
}
- memset(qp->buf, 0, qp->buf_size);
+ memset(qp->buf.buf, 0, qp->buf_size);
if (mthca_is_memfree(pd->context)) {
struct mthca_next_seg *next;
Index: libmthca/src/verbs.c
===================================================================
--- libmthca/src/verbs.c (revision 8793)
+++ libmthca/src/verbs.c (working copy)
@@ -188,11 +188,10 @@ struct ibv_cq *mthca_create_cq(struct ib
goto err;
cqe = align_cq_size(cqe);
- cq->buf = mthca_alloc_cq_buf(to_mdev(context->device), cqe);
- if (!cq->buf)
+ if (mthca_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe))
goto err;
- cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf,
+ cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf.buf,
cqe * MTHCA_CQ_ENTRY_SIZE,
0, IBV_ACCESS_LOCAL_WRITE);
if (!cq->mr)
@@ -251,7 +250,7 @@ err_unreg:
mthca_dereg_mr(cq->mr);
err_buf:
- free(cq->buf);
+ mthca_free_buf(&cq->buf);
err:
free(cq);
@@ -264,7 +263,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
struct mthca_cq *cq = to_mcq(ibcq);
struct mthca_resize_cq cmd;
struct ibv_mr *mr;
- void *buf;
+ struct mthca_buf buf;
int old_cqe;
int ret;
@@ -280,17 +279,15 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
goto out;
}
- buf = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), cqe);
- if (!buf) {
- ret = ENOMEM;
+ ret = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe);
+ if (ret)
goto out;
- }
- mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf,
+ mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf.buf,
cqe * MTHCA_CQ_ENTRY_SIZE,
0, IBV_ACCESS_LOCAL_WRITE);
if (!mr) {
- free(buf);
+ mthca_free_buf(&buf);
ret = ENOMEM;
goto out;
}
@@ -303,14 +300,14 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
if (ret) {
mthca_dereg_mr(mr);
- free(buf);
+ mthca_free_buf(&buf);
goto out;
}
- mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
+ mthca_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
mthca_dereg_mr(cq->mr);
- free(cq->buf);
+ mthca_free_buf(&cq->buf);
cq->buf = buf;
cq->mr = mr;
@@ -336,8 +333,7 @@ int mthca_destroy_cq(struct ibv_cq *cq)
}
mthca_dereg_mr(to_mcq(cq)->mr);
-
- free(to_mcq(cq)->buf);
+ mthca_free_buf(&to_mcq(cq)->buf);
free(to_mcq(cq));
return 0;
@@ -389,7 +385,7 @@ struct ibv_srq *mthca_create_srq(struct
if (mthca_alloc_srq_buf(pd, &attr->attr, srq))
goto err;
- srq->mr = __mthca_reg_mr(pd, srq->buf, srq->buf_size, 0, 0);
+ srq->mr = __mthca_reg_mr(pd, srq->buf.buf, srq->buf_size, 0, 0);
if (!srq->mr)
goto err_free;
@@ -430,7 +426,7 @@ err_unreg:
err_free:
free(srq->wrid);
- free(srq->buf);
+ mthca_free_buf(&srq->buf);
err:
free(srq);
@@ -469,7 +465,7 @@ int mthca_destroy_srq(struct ibv_srq *sr
mthca_dereg_mr(to_msrq(srq)->mr);
- free(to_msrq(srq)->buf);
+ mthca_free_buf(&to_msrq(srq)->buf);
free(to_msrq(srq)->wrid);
free(to_msrq(srq));
@@ -507,7 +503,7 @@ struct ibv_qp *mthca_create_qp(struct ib
pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
goto err_free;
- qp->mr = __mthca_reg_mr(pd, qp->buf, qp->buf_size, 0, 0);
+ qp->mr = __mthca_reg_mr(pd, qp->buf.buf, qp->buf_size, 0, 0);
if (!qp->mr)
goto err_free;
@@ -574,7 +570,7 @@ err_unreg:
err_free:
free(qp->wrid);
- free(qp->buf);
+ mthca_free_buf(&qp->buf);
err:
free(qp);
@@ -655,8 +651,7 @@ int mthca_destroy_qp(struct ibv_qp *qp)
}
mthca_dereg_mr(to_mqp(qp)->mr);
-
- free(to_mqp(qp)->buf);
+ mthca_free_buf(&to_mqp(qp)->buf);
free(to_mqp(qp)->wrid);
free(to_mqp(qp));
Index: libmthca/src/mthca.h
===================================================================
--- libmthca/src/mthca.h (revision 8793)
+++ libmthca/src/mthca.h (working copy)
@@ -112,6 +112,11 @@ struct mthca_context {
int qp_table_mask;
};
+/*
+ * Buffer handle filled in by mthca_alloc_buf() and released with
+ * mthca_free_buf().  NOTE(review): length is presumably the allocated
+ * size in bytes -- confirm against buf.c.
+ */
+struct mthca_buf {
+ void *buf;
+ size_t length;
+};
+
struct mthca_pd {
struct ibv_pd ibv_pd;
struct mthca_ah_page *ah_list;
@@ -121,7 +126,7 @@ struct mthca_pd {
struct mthca_cq {
struct ibv_cq ibv_cq;
- void *buf;
+ struct mthca_buf buf;
pthread_spinlock_t lock;
struct ibv_mr *mr;
uint32_t cqn;
@@ -137,7 +142,7 @@ struct mthca_cq {
struct mthca_srq {
struct ibv_srq ibv_srq;
- void *buf;
+ struct mthca_buf buf;
void *last;
pthread_spinlock_t lock;
struct ibv_mr *mr;
@@ -174,7 +179,7 @@ struct mthca_wq {
struct mthca_qp {
struct ibv_qp ibv_qp;
- void *buf;
+ struct mthca_buf buf;
uint64_t *wrid;
int send_wqe_offset;
int max_inline_data;
@@ -259,6 +264,9 @@ static inline int mthca_is_memfree(struc
return to_mdev(ibctx->device)->hca_type == MTHCA_ARBEL;
}
+int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+void mthca_free_buf(struct mthca_buf *buf);
+
int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
uint32_t **db);
void mthca_set_db_qn(uint32_t *db, enum mthca_db_type type, uint32_t qn);
@@ -290,7 +298,7 @@ void mthca_arbel_cq_event(struct ibv_cq
void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn,
struct mthca_srq *srq);
void mthca_cq_resize_copy_cqes(struct mthca_cq *cq, void *buf, int new_cqe);
-void *mthca_alloc_cq_buf(struct mthca_device *dev, int cqe);
+int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent);
struct ibv_srq *mthca_create_srq(struct ibv_pd *pd,
struct ibv_srq_init_attr *attr);
Index: libmthca/src/cq.c
===================================================================
--- libmthca/src/cq.c (revision 8793)
+++ libmthca/src/cq.c (working copy)
@@ -126,7 +126,7 @@ struct mthca_err_cqe {
static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
{
- return cq->buf + entry * MTHCA_CQ_ENTRY_SIZE;
+ return cq->buf.buf + entry * MTHCA_CQ_ENTRY_SIZE;
}
static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
@@ -612,17 +612,16 @@ void mthca_cq_resize_copy_cqes(struct mt
get_cqe(cq, i & old_cqe), MTHCA_CQ_ENTRY_SIZE);
}
-void *mthca_alloc_cq_buf(struct mthca_device *dev, int nent)
+/*
+ * Allocate the buffer for a CQ of nent entries into *buf using
+ * mthca_alloc_buf(), with the byte size passed through align() against
+ * dev->page_size, and set the owner field of every entry to
+ * MTHCA_CQ_ENTRY_OWNER_HW.  Returns 0 on success, -1 if the
+ * allocation fails.
+ */
+int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent)
{
- void *buf;
int i;
- if (posix_memalign(&buf, dev->page_size,
- align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size)))
- return NULL;
+ if (mthca_alloc_buf(buf, align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size),
+ dev->page_size))
+ return -1;
for (i = 0; i < nent; ++i)
- ((struct mthca_cqe *) buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
+ ((struct mthca_cqe *) buf->buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
- return buf;
+ return 0;
}
Index: libmthca/src/srq.c
===================================================================
--- libmthca/src/srq.c (revision 8793)
+++ libmthca/src/srq.c (working copy)
@@ -47,7 +47,7 @@
static void *get_wqe(struct mthca_srq *srq, int n)
{
- return srq->buf + (n << srq->wqe_shift);
+ return srq->buf.buf + (n << srq->wqe_shift);
}
/*
@@ -292,13 +292,14 @@ int mthca_alloc_srq_buf(struct ibv_pd *p
srq->buf_size = srq->max << srq->wqe_shift;
- if (posix_memalign(&srq->buf, to_mdev(pd->context->device)->page_size,
- align(srq->buf_size, to_mdev(pd->context->device)->page_size))) {
+ if (mthca_alloc_buf(&srq->buf,
+ align(srq->buf_size, to_mdev(pd->context->device)->page_size),
+ to_mdev(pd->context->device)->page_size)) {
free(srq->wrid);
return -1;
}
- memset(srq->buf, 0, srq->buf_size);
+ memset(srq->buf.buf, 0, srq->buf_size);
/*
* Now initialize the SRQ buffer so that all of the WQEs are
Index: libmthca/src/ah.c
===================================================================
--- libmthca/src/ah.c (revision 8793)
+++ libmthca/src/ah.c (working copy)
@@ -45,7 +45,7 @@
struct mthca_ah_page {
struct mthca_ah_page *prev, *next;
- void *buf;
+ struct mthca_buf buf;
struct ibv_mr *mr;
int use_cnt;
unsigned free[0];
@@ -60,14 +60,14 @@ static struct mthca_ah_page *__add_page(
if (!page)
return NULL;
- if (posix_memalign(&page->buf, page_size, page_size)) {
+ if (mthca_alloc_buf(&page->buf, page_size, page_size)) {
free(page);
return NULL;
}
- page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
+ page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf.buf, page_size, 0);
if (!page->mr) {
- free(page->buf);
+ mthca_free_buf(&page->buf);
free(page);
return NULL;
}
@@ -123,7 +123,7 @@ int mthca_alloc_av(struct mthca_pd *pd,
if (page->free[i]) {
j = ffs(page->free[i]);
page->free[i] &= ~(1 << (j - 1));
- ah->av = page->buf +
+ ah->av = page->buf.buf +
(i * 8 * sizeof (int) + (j - 1)) * sizeof *ah->av;
break;
}
@@ -172,7 +172,7 @@ void mthca_free_av(struct mthca_ah *ah)
pthread_mutex_lock(&pd->ah_mutex);
page = ah->page;
- i = ((void *) ah->av - page->buf) / sizeof *ah->av;
+ i = ((void *) ah->av - page->buf.buf) / sizeof *ah->av;
page->free[i / (8 * sizeof (int))] |= 1 << (i % (8 * sizeof (int)));
if (!--page->use_cnt) {
@@ -184,7 +184,7 @@ void mthca_free_av(struct mthca_ah *ah)
page->next->prev = page->prev;
mthca_dereg_mr(page->mr);
- free(page->buf);
+ mthca_free_buf(&page->buf);
free(page);
}
Index: libmthca/src/buf.c
===================================================================
--- libmthca/src/buf.c (revision 0)
+++ libmthca/src/buf.c (revision 0)
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdlib.h>
+
+#include "mthca.h"
+
+#if !(defined(HAVE_IBV_DONTFORK_RANGE) && defined(HAVE_IBV_DOFORK_RANGE))
+
+/*
+ * If libibverbs isn't exporting these functions, then there's no
+ * point in doing it here, because the rest of libibverbs isn't going
+ * to be fork-safe anyway.
+ */
+static int ibv_dontfork_range(void *base, size_t size)
+{
+ return 0;
+}
+
+static int ibv_dofork_range(void *base, size_t size)
+{
+ return 0;
+}
+
+#endif /* HAVE_IBV_DONTFORK_RANGE && HAVE_IBV_DOFORK_RANGE */
+
+/*
+ * Allocate a page_size-aligned buffer of align(size, page_size) bytes
+ * and pass its first size bytes to ibv_dontfork_range().
+ *
+ * Returns 0 on success, with buf->buf pointing at the memory and
+ * buf->length set to size.  If either step fails, the non-zero status
+ * of the failing call is returned and any memory already allocated
+ * is freed; buf->length is left untouched.
+ */
+int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
+{
+ int ret;
+
+ ret = posix_memalign(&buf->buf, page_size, align(size, page_size));
+ if (ret)
+ return ret;
+
+ ret = ibv_dontfork_range(buf->buf, size);
+ if (ret)
+ free(buf->buf);
+
+ if (!ret)
+ buf->length = size;
+
+ return ret;
+}
+
+/*
+ * Release a buffer set up by mthca_alloc_buf(): call
+ * ibv_dofork_range() on the recorded range, then free the memory.
+ */
+void mthca_free_buf(struct mthca_buf *buf)
+{
+ ibv_dofork_range(buf->buf, buf->length);
+ free(buf->buf);
+}
Index: libmthca/ChangeLog
===================================================================
--- libmthca/ChangeLog (revision 8793)
+++ libmthca/ChangeLog (working copy)
@@ -1,3 +1,19 @@
+2006-07-26 Roland Dreier <rdreier at cisco.com>
+
+ * src/mthca.h, src/ah.c, src/cq.c, src/memfree.c, src/qp.c,
+ src/srq.c, src/verbs.c: Convert internal allocations for AH pages
+ (for non-memfree HCAs), CQ buffers, doorbell pages (for memfree
+ HCAs), QP buffers and SRQ buffers to use the new buffer
+ allocator. This makes libmthca fork()-clean when built against
+ libibverbs 1.1.
+
+ * src/buf.c (mthca_alloc_buf, mthca_free_buf): Add new functions
+ to wrap up allocating page-aligned buffers. The new functions
+ will call ibv_dontfork_range()/ibv_dofork_range() to do proper
+ madvise()ing to handle fork(), if applicable.
+
+ * configure.in: Check for ibv_dontfork_range() and ibv_dofork_range().
+
2006-07-04 Dotan Barak <dotanb at mellanox.co.il>
* src/verbs.c (mthca_create_cq, mthca_resize_cq): Passing huge
Index: libmthca/Makefile.am
===================================================================
--- libmthca/Makefile.am (revision 8793)
+++ libmthca/Makefile.am (working copy)
@@ -12,10 +12,9 @@ else
mthca_version_script =
endif
-src_mthca_la_SOURCES = src/ah.c src/cq.c src/memfree.c src/mthca.c src/qp.c \
- src/srq.c src/verbs.c
-src_mthca_la_LDFLAGS = -avoid-version -module \
- $(mthca_version_script)
+src_mthca_la_SOURCES = src/ah.c src/buf.c src/cq.c src/memfree.c src/mthca.c \
+ src/qp.c src/srq.c src/verbs.c
+src_mthca_la_LDFLAGS = -avoid-version -module $(mthca_version_script)
DEBIAN = debian/changelog debian/compat debian/control debian/copyright \
debian/libmthca1.install debian/libmthca-dev.install debian/rules
More information about the general
mailing list