[openib-general] [PATCH/RFC] libibverbs and libmthca fork support
glebn at voltaire.com
glebn at voltaire.com
Tue Aug 1 06:17:56 PDT 2006
On Mon, Jul 31, 2006 at 11:52:18AM -0700, Roland Dreier wrote:
> Here's an initial cut (based on Gleb Natapov's work) at using
> madvise(MADV_DONTFORK) to support fork() from libibverbs. The main
> changes from Gleb's earlier work are:
>
> - I added code to handle doorbell pages in libmthca. As far as I can
> see this is necessary -- my tests don't work without it. Gleb, did
> you ever test your changes on memfree HCAs?
>
Nope. And now that I have tested my changes on a memfree HCA, they don't
work there. Bringing your doorbell page fix over helps, though :)
> - I added a new API function, ibv_fork_init(), which must be called
> before everything else if an app expects to do fork(). I did this
> because I wanted a way for apps to know if fork() was expected to
> work or not, and also because the vast majority of apps don't
> fork() and probably don't want to pay the price of an extra system
> call plus RB tree operation for every memory registration.
>
> - And the bulk of this patch is converting memory.c over to use RB
> trees -- I just couldn't bring myself to use an O(N) algorithm at
> this stage...
>
That's excellent!
> Comments welcome...
>
You forgot to include buf.c in the patch: mthca_alloc_buf() and mthca_free_buf() are declared in mthca.h, but the patch never defines them.
> - R.
>
>
> Index: libibverbs/include/infiniband/driver.h
> ===================================================================
> --- libibverbs/include/infiniband/driver.h (revision 8791)
> +++ libibverbs/include/infiniband/driver.h (working copy)
> @@ -135,6 +135,9 @@ int ibv_cmd_destroy_ah(struct ibv_ah *ah
> int ibv_cmd_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
> int ibv_cmd_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
>
> +int ibv_dontfork_range(void *base, size_t size);
> +int ibv_dofork_range(void *base, size_t size);
> +
> /*
> * sysfs helper functions
> */
> Index: libibverbs/include/infiniband/verbs.h
> ===================================================================
> --- libibverbs/include/infiniband/verbs.h (revision 8791)
> +++ libibverbs/include/infiniband/verbs.h (working copy)
> @@ -285,6 +285,8 @@ struct ibv_pd {
> struct ibv_mr {
> struct ibv_context *context;
> struct ibv_pd *pd;
> + void *addr;
> + size_t length;
> uint32_t handle;
> uint32_t lkey;
> uint32_t rkey;
> @@ -1016,6 +1018,14 @@ int ibv_attach_mcast(struct ibv_qp *qp,
> */
> int ibv_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
>
> +/**
> + * ibv_fork_init - Prepare data structures so that fork() may be used
> + * safely. If this function is not called or returns a non-zero
> + * status, then libibverbs data structures are not fork()-safe and the
> + * effect of an application calling fork() is undefined.
> + */
> +int ibv_fork_init(void);
> +
> END_C_DECLS
>
> # undef __attribute_const
> Index: libibverbs/ChangeLog
> ===================================================================
> --- libibverbs/ChangeLog (revision 8791)
> +++ libibverbs/ChangeLog (working copy)
> @@ -1,3 +1,29 @@
> +2006-07-26 Roland Dreier <rdreier at cisco.com>
> +
> + * src/verbs.c (ibv_reg_mr, ibv_dereg_mr): Add calls to
> + ibv_dontfork_range() and ibv_dofork_range() for memory regions
> + registered by library consumers.
> +
> + * include/infiniband/verbs.h: Add declaration of ibv_fork_init().
> +
> + * include/infiniband/driver.h: Add declarations of
> + ibv_dontfork_range() and ibv_dofork_range().
> +
> + * src/memory.c: Rewrite to use a red-black tree instead of a
> + linked list. Change from doing mlock()/munlock() to
> + madvise(..., MADV_DONTFORK) and madvise(..., MADV_DOFORK), and
> + change the name of the entry points to ibv_dontfork_range() and
> + ibv_dofork_range(). Add ibv_fork_init() for applications to
> + request fork-safe behavior.
> +
> + * src/ibverbs.h: Kill off unused declarations.
> +
> + * src/init.c (ibverbs_init): Get rid of call to ibv_init_mem_map().
> +
> + * include/infiniband/verbs.h: Add addr and length field to struct
> + ibv_mr so that memory regions can be madvised(). This changes the
> + ABI, since the layout of struct ibv_mr is changed.
> +
> 2006-07-04 Roland Dreier <rdreier at cisco.com>
>
> * include/infiniband/arch.h: Fix typo in sparc mb()
> Index: libibverbs/src/libibverbs.map
> ===================================================================
> --- libibverbs/src/libibverbs.map (revision 8791)
> +++ libibverbs/src/libibverbs.map (working copy)
> @@ -74,6 +74,9 @@ IBVERBS_1.0 {
> mult_to_ibv_rate;
> ibv_get_sysfs_path;
> ibv_read_sysfs_file;
> + ibv_fork_init;
> + ibv_dontfork_range;
> + ibv_dofork_range;
>
> local: *;
> };
> Index: libibverbs/src/ibverbs.h
> ===================================================================
> --- libibverbs/src/ibverbs.h (revision 8791)
> +++ libibverbs/src/ibverbs.h (working copy)
> @@ -58,11 +58,7 @@ struct ibv_abi_compat_v2 {
>
> extern HIDDEN int abi_ver;
>
> -extern HIDDEN int ibverbs_init(struct ibv_device ***list);
> -
> -extern HIDDEN int ibv_init_mem_map(void);
> -extern HIDDEN int ibv_lock_range(void *base, size_t size);
> -extern HIDDEN int ibv_unlock_range(void *base, size_t size);
> +HIDDEN int ibverbs_init(struct ibv_device ***list);
>
> #define IBV_INIT_CMD(cmd, size, opcode) \
> do { \
> Index: libibverbs/src/verbs.c
> ===================================================================
> --- libibverbs/src/verbs.c (revision 8791)
> +++ libibverbs/src/verbs.c (working copy)
> @@ -155,18 +155,32 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd
> {
> struct ibv_mr *mr;
>
> + if (ibv_dontfork_range(addr, length))
> + return NULL;
> +
> mr = pd->context->ops.reg_mr(pd, addr, length, access);
> if (mr) {
> mr->context = pd->context;
> mr->pd = pd;
> - }
> + mr->addr = addr;
> + mr->length = length;
> + } else
> + ibv_dofork_range(addr, length);
>
> return mr;
> }
>
> int ibv_dereg_mr(struct ibv_mr *mr)
> {
> - return mr->context->ops.dereg_mr(mr);
> + int ret;
> + void *addr = mr->addr;
> + size_t length = mr->length;
> +
> + ret = mr->context->ops.dereg_mr(mr);
> + if (!ret)
> + ibv_dofork_range(addr, length);
> +
> + return ret;
> }
>
> static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
> Index: libibverbs/src/init.c
> ===================================================================
> --- libibverbs/src/init.c (revision 8791)
> +++ libibverbs/src/init.c (working copy)
> @@ -205,9 +205,6 @@ HIDDEN int ibverbs_init(struct ibv_devic
>
> *list = NULL;
>
> - if (ibv_init_mem_map())
> - return 0;
> -
> find_drivers(default_path);
>
> /*
> Index: libibverbs/src/memory.c
> ===================================================================
> --- libibverbs/src/memory.c (revision 8791)
> +++ libibverbs/src/memory.c (working copy)
> @@ -1,5 +1,6 @@
> /*
> * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
> + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
> *
> * This software is available to you under a choice of one of two
> * licenses. You may choose to be licensed under the terms of the GNU
> @@ -36,6 +37,7 @@
> # include <config.h>
> #endif /* HAVE_CONFIG_H */
>
> +#include <errno.h>
> #include <sys/mman.h>
> #include <unistd.h>
> #include <stdlib.h>
> @@ -44,114 +46,424 @@
> #include "ibverbs.h"
>
> /*
> - * We keep a linked list of page ranges that have been locked along with a
> - * reference count to manage overlapping registrations, etc.
> - *
> - * Eventually we should turn this into an RB-tree or something similar
> - * to avoid the O(n) cost of registering/unregistering memory.
> + * Most distros' headers don't define these yet.
> */
> +#ifndef MADV_DONTFORK
> +#define MADV_DONTFORK 10
> +#endif
> +
> +#ifndef MADV_DOFORK
> +#define MADV_DOFORK 11
> +#endif
>
> struct ibv_mem_node {
> - struct ibv_mem_node *prev, *next;
> - uintptr_t start, end;
> - int refcnt;
> + enum {
> + IBV_RED,
> + IBV_BLACK
> + } color;
> + struct ibv_mem_node *parent;
> + struct ibv_mem_node *left, *right;
> + uintptr_t start, end;
> + int refcnt;
> };
>
> -static struct {
> - struct ibv_mem_node *first;
> - pthread_mutex_t mutex;
> - uintptr_t page_size;
> -} mem_map;
> +static struct ibv_mem_node *mm_root;
> +static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
> +static int page_size;
> +static int too_late;
>
> -int ibv_init_mem_map(void)
> +int ibv_fork_init(void)
> {
> - struct ibv_mem_node *node = NULL;
> -
> - node = malloc(sizeof *node);
> - if (!node)
> - goto fail;
> -
> - node->prev = node->next = NULL;
> - node->start = 0;
> - node->end = UINTPTR_MAX;
> - node->refcnt = 0;
> + void *tmp;
>
> - mem_map.first = node;
> + if (mm_root)
> + return 0;
>
> - mem_map.page_size = sysconf(_SC_PAGESIZE);
> - if (mem_map.page_size < 0)
> - goto fail;
> + if (too_late)
> + return EINVAL;
>
> - if (pthread_mutex_init(&mem_map.mutex, NULL))
> - goto fail;
> + page_size = sysconf(_SC_PAGESIZE);
> + if (page_size < 0)
> + return errno;
> +
> + if (posix_memalign(&tmp, page_size, page_size))
> + return ENOMEM;
> +
> + if (madvise(tmp, page_size, MADV_DONTFORK) ||
> + madvise(tmp, page_size, MADV_DOFORK))
> + return ENOSYS;
> +
> + free(tmp);
> +
> + mm_root = malloc(sizeof *mm_root);
> + if (!mm_root)
> + return ENOMEM;
> +
> + mm_root->parent = NULL;
> + mm_root->left = NULL;
> + mm_root->right = NULL;
> + mm_root->color = IBV_BLACK;
> + mm_root->start = 0;
> + mm_root->end = UINTPTR_MAX;
> + mm_root->refcnt = 0;
>
> return 0;
> +}
>
> -fail:
> - if (node)
> - free(node);
> +static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
> +{
> + if (node->left) {
> + node = node->left;
> + while (node->right)
> + node = node->right;
> + } else {
> + while (node->parent && node == node->parent->left)
> + node = node->parent;
>
> - return -1;
> + node = node->parent;
> + }
> +
> + return node;
> }
>
> -static struct ibv_mem_node *__mm_find_first(uintptr_t start, uintptr_t end)
> +static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
> {
> - struct ibv_mem_node *node = mem_map.first;
> + if (node->right) {
> + node = node->right;
> + while (node->left)
> + node = node->left;
> + } else {
> + while (node->parent && node == node->parent->right)
> + node = node->parent;
>
> - while (node) {
> - if ((node->start <= start && node->end >= start) ||
> - (node->start <= end && node->end >= end))
> - break;
> - node = node->next;
> + node = node->parent;
> }
>
> return node;
> }
>
> -static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
> +static void __mm_rotate_right(struct ibv_mem_node *node)
> {
> - return node->prev;
> + struct ibv_mem_node *tmp;
> +
> + tmp = node->left;
> +
> + node->left = tmp->right;
> + if (node->left)
> + node->left->parent = node;
> +
> + if (node->parent) {
> + if (node->parent->right == node)
> + node->parent->right = tmp;
> + else
> + node->parent->left = tmp;
> + } else
> + mm_root = tmp;
> +
> + tmp->parent = node->parent;
> +
> + tmp->right = node;
> + node->parent = tmp;
> }
>
> -static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
> +static void __mm_rotate_left(struct ibv_mem_node *node)
> +{
> + struct ibv_mem_node *tmp;
> +
> + tmp = node->right;
> +
> + node->right = tmp->left;
> + if (node->right)
> + node->right->parent = node;
> +
> + if (node->parent) {
> + if (node->parent->right == node)
> + node->parent->right = tmp;
> + else
> + node->parent->left = tmp;
> + } else
> + mm_root = tmp;
> +
> + tmp->parent = node->parent;
> +
> + tmp->left = node;
> + node->parent = tmp;
> +}
> +
> +static int verify(struct ibv_mem_node *node)
> +{
> + int hl, hr;
> +
> + if (!node)
> + return 1;
> +
> + hl = verify(node->left);
> + hr = verify(node->left);
> +
> + if (!hl || !hr)
> + return 0;
> + if (hl != hr)
> + return 0;
> +
> + if (node->color == IBV_RED) {
> + if (node->left && node->left->color != IBV_BLACK)
> + return 0;
> + if (node->right && node->right->color != IBV_BLACK)
> + return 0;
> + return hl;
> + }
> +
> + return hl + 1;
> +}
> +
> +static void __mm_add_rebalance(struct ibv_mem_node *node)
> {
> - return node->next;
> + struct ibv_mem_node *parent, *gp, *uncle;
> +
> + while (node->parent && node->parent->color == IBV_RED) {
> + parent = node->parent;
> + gp = node->parent->parent;
> +
> + if (parent == gp->left) {
> + uncle = gp->right;
> +
> + if (uncle && uncle->color == IBV_RED) {
> + parent->color = IBV_BLACK;
> + uncle->color = IBV_BLACK;
> + gp->color = IBV_RED;
> +
> + node = gp;
> + } else {
> + if (node == parent->right) {
> + __mm_rotate_left(parent);
> + node = parent;
> + parent = node->parent;
> + }
> +
> + parent->color = IBV_BLACK;
> + gp->color = IBV_RED;
> +
> + __mm_rotate_right(gp);
> + }
> + } else {
> + uncle = gp->left;
> +
> + if (uncle && uncle->color == IBV_RED) {
> + parent->color = IBV_BLACK;
> + uncle->color = IBV_BLACK;
> + gp->color = IBV_RED;
> +
> + node = gp;
> + } else {
> + if (node == parent->left) {
> + __mm_rotate_right(parent);
> + node = parent;
> + parent = node->parent;
> + }
> +
> + parent->color = IBV_BLACK;
> + gp->color = IBV_RED;
> +
> + __mm_rotate_left(gp);
> + }
> + }
> + }
> +
> + mm_root->color = IBV_BLACK;
> }
>
> -static void __mm_add(struct ibv_mem_node *node,
> - struct ibv_mem_node *new)
> +static void __mm_add(struct ibv_mem_node *new)
> {
> - new->prev = node;
> - new->next = node->next;
> - node->next = new;
> - if (new->next)
> - new->next->prev = new;
> + struct ibv_mem_node *node, *parent = NULL;
> +
> + node = mm_root;
> + while (node) {
> + parent = node;
> + if (node->start < new->start)
> + node = node->right;
> + else
> + node = node->left;
> + }
> +
> + if (parent->start < new->start)
> + parent->right = new;
> + else
> + parent->left = new;
> +
> + new->parent = parent;
> + new->left = NULL;
> + new->right = NULL;
> +
> + new->color = IBV_RED;
> + __mm_add_rebalance(new);
> }
>
> static void __mm_remove(struct ibv_mem_node *node)
> {
> - /* Never have to remove the first node, so we can use prev */
> - node->prev->next = node->next;
> - if (node->next)
> - node->next->prev = node->prev;
> + struct ibv_mem_node *child, *parent, *sib, *tmp;
> + int nodecol;
> +
> + if (node->left && node->right) {
> + tmp = node->left;
> + while (tmp->right)
> + tmp = tmp->right;
> +
> + nodecol = tmp->color;
> + child = tmp->left;
> + tmp->color = node->color;
> +
> + if (tmp->parent != node) {
> + parent = tmp->parent;
> + parent->right = tmp->left;
> + if (tmp->left)
> + tmp->left->parent = parent;
> +
> + tmp->left = node->left;
> + node->left->parent = tmp;
> + } else
> + parent = tmp;
> +
> + tmp->right = node->right;
> + node->right->parent = tmp;
> +
> + tmp->parent = node->parent;
> + if (node->parent) {
> + if (node->parent->left == node)
> + node->parent->left = tmp;
> + else
> + node->parent->right = tmp;
> + } else
> + mm_root = tmp;
> + } else {
> + nodecol = node->color;
> +
> + child = node->left ? node->left : node->right;
> + parent = node->parent;
> +
> + if (child)
> + child->parent = parent;
> + if (parent) {
> + if (parent->left == node)
> + parent->left = child;
> + else
> + parent->right = child;
> + } else
> + mm_root = child;
> + }
> +
> + free(node);
> +
> + if (nodecol == IBV_RED)
> + return;
> +
> + while ((!child || child->color == IBV_BLACK) && child != mm_root) {
> + if (parent->left == child) {
> + sib = parent->right;
> +
> + if (sib->color == IBV_RED) {
> + parent->color = IBV_RED;
> + sib->color = IBV_BLACK;
> + __mm_rotate_left(parent);
> + sib = parent->right;
> + }
> +
> + if ((!sib->left || sib->left->color == IBV_BLACK) &&
> + (!sib->right || sib->right->color == IBV_BLACK)) {
> + sib->color = IBV_RED;
> + child = parent;
> + parent = child->parent;
> + } else {
> + if (!sib->right || sib->right->color == IBV_BLACK) {
> + if (sib->left)
> + sib->left->color = IBV_BLACK;
> + sib->color = IBV_RED;
> + __mm_rotate_right(sib);
> + sib = parent->right;
> + }
> +
> + sib->color = parent->color;
> + parent->color = IBV_BLACK;
> + if (sib->right)
> + sib->right->color = IBV_BLACK;
> + __mm_rotate_left(parent);
> + child = mm_root;
> + break;
> + }
> + } else {
> + sib = parent->left;
> +
> + if (sib->color == IBV_RED) {
> + parent->color = IBV_RED;
> + sib->color = IBV_BLACK;
> + __mm_rotate_right(parent);
> + sib = parent->left;
> + }
> +
> + if ((!sib->left || sib->left->color == IBV_BLACK) &&
> + (!sib->right || sib->right->color == IBV_BLACK)) {
> + sib->color = IBV_RED;
> + child = parent;
> + parent = child->parent;
> + } else {
> + if (!sib->left || sib->left->color == IBV_BLACK) {
> + if (sib->right)
> + sib->right->color = IBV_BLACK;
> + sib->color = IBV_RED;
> + __mm_rotate_left(sib);
> + sib = parent->left;
> + }
> +
> + sib->color = parent->color;
> + parent->color = IBV_BLACK;
> + if (sib->left)
> + sib->left->color = IBV_BLACK;
> + __mm_rotate_right(parent);
> + child = mm_root;
> + break;
> + }
> + }
> + }
> +
> + if (child)
> + child->color = IBV_BLACK;
> +}
> +
> +static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
> +{
> + struct ibv_mem_node *node = mm_root;
> +
> + while (node) {
> + if (node->start <= start && node->end >= start)
> + break;
> +
> + if (node->start < start)
> + node = node->right;
> + else
> + node = node->left;
> + }
> +
> + return node;
> }
>
> -int ibv_lock_range(void *base, size_t size)
> +static int ibv_madvise_range(void *base, size_t size, int advice)
> {
> uintptr_t start, end;
> struct ibv_mem_node *node, *tmp;
> + int inc;
> int ret = 0;
>
> if (!size)
> return 0;
>
> - start = (uintptr_t) base & ~(mem_map.page_size - 1);
> - end = ((uintptr_t) (base + size + mem_map.page_size - 1) &
> - ~(mem_map.page_size - 1)) - 1;
> + inc = advice == MADV_DONTFORK ? 1 : -1;
> +
> + start = (uintptr_t) base & ~(page_size - 1);
> + end = ((uintptr_t) (base + size + page_size - 1) &
> + ~(page_size - 1)) - 1;
>
> - pthread_mutex_lock(&mem_map.mutex);
> + pthread_mutex_lock(&mm_mutex);
>
> - node = __mm_find_first(start, end);
> + node = __mm_find_start(start, end);
>
> if (node->start < start) {
> tmp = malloc(sizeof *tmp);
> @@ -165,11 +477,19 @@ int ibv_lock_range(void *base, size_t si
> tmp->refcnt = node->refcnt;
> node->end = start - 1;
>
> - __mm_add(node, tmp);
> + __mm_add(tmp);
> node = tmp;
> + } else {
> + tmp = __mm_prev(node);
> + if (tmp && tmp->refcnt == node->refcnt + inc) {
> + tmp->end = node->end;
> + tmp->refcnt = node->refcnt;
> + __mm_remove(node);
> + node = tmp;
> + }
> }
>
> - while (node->start <= end) {
> + while (node && node->start <= end) {
> if (node->end > end) {
> tmp = malloc(sizeof *tmp);
> if (!tmp) {
> @@ -182,13 +502,16 @@ int ibv_lock_range(void *base, size_t si
> tmp->refcnt = node->refcnt;
> node->end = end;
>
> - __mm_add(node, tmp);
> + __mm_add(tmp);
> }
>
> + node->refcnt += inc;
>
> - if (node->refcnt++ == 0) {
> - ret = mlock((void *) node->start,
> - node->end - node->start + 1);
> + if ((inc == -1 && node->refcnt == 0) ||
> + (inc == 1 && node->refcnt == 1)) {
> + ret = madvise((void *) node->start,
> + node->end - node->start + 1,
> + advice);
> if (ret)
> goto out;
> }
> @@ -196,63 +519,36 @@ int ibv_lock_range(void *base, size_t si
> node = __mm_next(node);
> }
>
> + if (node) {
> + tmp = __mm_prev(node);
> + if (tmp && node->refcnt == tmp->refcnt) {
> + tmp->end = node->end;
> + __mm_remove(node);
> + }
> + }
> +
> out:
> - pthread_mutex_unlock(&mem_map.mutex);
> + pthread_mutex_unlock(&mm_mutex);
>
> return ret;
> }
>
> -int ibv_unlock_range(void *base, size_t size)
> +int ibv_dontfork_range(void *base, size_t size)
> {
> - uintptr_t start, end;
> - struct ibv_mem_node *node, *tmp;
> - int ret = 0;
> -
> - if (!size)
> + if (mm_root)
> + return ibv_madvise_range(base, size, MADV_DONTFORK);
> + else {
> + too_late = 1;
> return 0;
> -
> - start = (uintptr_t) base & ~(mem_map.page_size - 1);
> - end = ((uintptr_t) (base + size + mem_map.page_size - 1) &
> - ~(mem_map.page_size - 1)) - 1;
> -
> - pthread_mutex_lock(&mem_map.mutex);
> -
> - node = __mm_find_first(start, end);
> -
> - if (node->start != start) {
> - ret = -1;
> - goto out;
> - }
> -
> - while (node && node->end <= end) {
> - if (--node->refcnt == 0) {
> - ret = munlock((void *) node->start,
> - node->end - node->start + 1);
> - }
> -
> - if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
> - __mm_prev(node)->end = node->end;
> - tmp = __mm_prev(node);
> - __mm_remove(node);
> - node = tmp;
> - }
> -
> - node = __mm_next(node);
> - }
> -
> - if (node && node->refcnt == __mm_prev(node)->refcnt) {
> - __mm_prev(node)->end = node->end;
> - tmp = __mm_prev(node);
> - __mm_remove(node);
> }
> +}
>
> - if (node->end != end) {
> - ret = -1;
> - goto out;
> +int ibv_dofork_range(void *base, size_t size)
> +{
> + if (mm_root)
> + return ibv_madvise_range(base, size, MADV_DOFORK);
> + else {
> + too_late = 1;
> + return 0;
> }
> -
> -out:
> - pthread_mutex_unlock(&mem_map.mutex);
> -
> - return ret;
> }
> Index: libmthca/configure.in
> ===================================================================
> --- libmthca/configure.in (revision 8791)
> +++ libmthca/configure.in (working copy)
> @@ -26,7 +26,7 @@ AC_C_CONST
> AC_CHECK_SIZEOF(long)
>
> dnl Checks for library functions
> -AC_CHECK_FUNCS(ibv_read_sysfs_file)
> +AC_CHECK_FUNCS(ibv_read_sysfs_file ibv_dontfork_range ibv_dofork_range)
>
> AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script,
> if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then
> Index: libmthca/src/memfree.c
> ===================================================================
> --- libmthca/src/memfree.c (revision 8791)
> +++ libmthca/src/memfree.c (working copy)
> @@ -46,8 +46,8 @@
> #define MTHCA_FREE_MAP_SIZE (MTHCA_DB_REC_PER_PAGE / (SIZEOF_LONG * 8))
>
> struct mthca_db_page {
> - unsigned long free[MTHCA_FREE_MAP_SIZE];
> - uint64_t *db_rec;
> + unsigned long free[MTHCA_FREE_MAP_SIZE];
> + struct mthca_buf db_rec;
> };
>
> struct mthca_db_table {
> @@ -91,7 +91,7 @@ int mthca_alloc_db(struct mthca_db_table
> }
>
> for (i = start; i != end; i += dir)
> - if (db_tab->page[i].db_rec)
> + if (db_tab->page[i].db_rec.buf)
> for (j = 0; j < MTHCA_FREE_MAP_SIZE; ++j)
> if (db_tab->page[i].free[j])
> goto found;
> @@ -101,18 +101,14 @@ int mthca_alloc_db(struct mthca_db_table
> goto out;
> }
>
> - {
> - void *tmp;
> -
> - if (posix_memalign(&tmp, MTHCA_DB_REC_PAGE_SIZE,
> - MTHCA_DB_REC_PAGE_SIZE)) {
> - ret = -1;
> - goto out;
> - }
> - db_tab->page[i].db_rec = tmp;
> + if (mthca_alloc_buf(&db_tab->page[i].db_rec,
> + MTHCA_DB_REC_PAGE_SIZE,
> + MTHCA_DB_REC_PAGE_SIZE)) {
> + ret = -1;
> + goto out;
> }
>
> - memset(db_tab->page[i].db_rec, 0, MTHCA_DB_REC_PAGE_SIZE);
> + memset(db_tab->page[i].db_rec.buf, 0, MTHCA_DB_REC_PAGE_SIZE);
> memset(db_tab->page[i].free, 0xff, sizeof db_tab->page[i].free);
>
> if (group == 0)
> @@ -140,7 +136,7 @@ found:
> j = MTHCA_DB_REC_PER_PAGE - 1 - j;
>
> ret = i * MTHCA_DB_REC_PER_PAGE + j;
> - *db = (uint32_t *) &db_tab->page[i].db_rec[j];
> + *db = db_tab->page[i].db_rec.buf + j * 8;
>
> out:
> pthread_mutex_unlock(&db_tab->mutex);
> @@ -163,7 +159,7 @@ void mthca_free_db(struct mthca_db_table
> page = db_tab->page + i;
>
> pthread_mutex_lock(&db_tab->mutex);
> - page->db_rec[j] = 0;
> + *(uint64_t *) (page->db_rec.buf + j * 8) = 0;
>
> if (i >= db_tab->min_group2)
> j = MTHCA_DB_REC_PER_PAGE - 1 - j;
> @@ -190,7 +186,7 @@ struct mthca_db_table *mthca_alloc_db_ta
> db_tab->min_group2 = npages - 1;
>
> for (i = 0; i < npages; ++i)
> - db_tab->page[i].db_rec = NULL;
> + db_tab->page[i].db_rec.buf = NULL;
>
> return db_tab;
> }
> @@ -203,8 +199,8 @@ void mthca_free_db_tab(struct mthca_db_t
> return;
>
> for (i = 0; i < db_tab->npages; ++i)
> - if (db_tab->page[i].db_rec)
> - free(db_tab->page[i].db_rec);
> + if (db_tab->page[i].db_rec.buf)
> + mthca_free_buf(&db_tab->page[i].db_rec);
>
> free(db_tab);
> }
> Index: libmthca/src/qp.c
> ===================================================================
> --- libmthca/src/qp.c (revision 8791)
> +++ libmthca/src/qp.c (working copy)
> @@ -58,12 +58,12 @@ static const uint8_t mthca_opcode[] = {
>
> static void *get_recv_wqe(struct mthca_qp *qp, int n)
> {
> - return qp->buf + (n << qp->rq.wqe_shift);
> + return qp->buf.buf + (n << qp->rq.wqe_shift);
> }
>
> static void *get_send_wqe(struct mthca_qp *qp, int n)
> {
> - return qp->buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
> + return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
> }
>
> void mthca_init_qp_indices(struct mthca_qp *qp)
> @@ -821,13 +821,14 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd
>
> qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
>
> - if (posix_memalign(&qp->buf, to_mdev(pd->context->device)->page_size,
> - align(qp->buf_size, to_mdev(pd->context->device)->page_size))) {
> + if (mthca_alloc_buf(&qp->buf,
> + align(qp->buf_size, to_mdev(pd->context->device)->page_size),
> + to_mdev(pd->context->device)->page_size)) {
> free(qp->wrid);
> return -1;
> }
>
> - memset(qp->buf, 0, qp->buf_size);
> + memset(qp->buf.buf, 0, qp->buf_size);
>
> if (mthca_is_memfree(pd->context)) {
> struct mthca_next_seg *next;
> Index: libmthca/src/verbs.c
> ===================================================================
> --- libmthca/src/verbs.c (revision 8791)
> +++ libmthca/src/verbs.c (working copy)
> @@ -188,11 +188,10 @@ struct ibv_cq *mthca_create_cq(struct ib
> goto err;
>
> cqe = align_cq_size(cqe);
> - cq->buf = mthca_alloc_cq_buf(to_mdev(context->device), cqe);
> - if (!cq->buf)
> + if (mthca_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe))
> goto err;
>
> - cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf,
> + cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf.buf,
> cqe * MTHCA_CQ_ENTRY_SIZE,
> 0, IBV_ACCESS_LOCAL_WRITE);
> if (!cq->mr)
> @@ -251,7 +250,7 @@ err_unreg:
> mthca_dereg_mr(cq->mr);
>
> err_buf:
> - free(cq->buf);
> + mthca_free_buf(&cq->buf);
>
> err:
> free(cq);
> @@ -264,7 +263,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
> struct mthca_cq *cq = to_mcq(ibcq);
> struct mthca_resize_cq cmd;
> struct ibv_mr *mr;
> - void *buf;
> + struct mthca_buf buf;
> int old_cqe;
> int ret;
>
> @@ -280,17 +279,15 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
> goto out;
> }
>
> - buf = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), cqe);
> - if (!buf) {
> - ret = ENOMEM;
> + ret = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe);
> + if (ret)
> goto out;
> - }
>
> - mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf,
> + mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf.buf,
> cqe * MTHCA_CQ_ENTRY_SIZE,
> 0, IBV_ACCESS_LOCAL_WRITE);
> if (!mr) {
> - free(buf);
> + mthca_free_buf(&buf);
> ret = ENOMEM;
> goto out;
> }
> @@ -303,14 +300,14 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
> ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
> if (ret) {
> mthca_dereg_mr(mr);
> - free(buf);
> + mthca_free_buf(&buf);
> goto out;
> }
>
> - mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
> + mthca_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
>
> mthca_dereg_mr(cq->mr);
> - free(cq->buf);
> + mthca_free_buf(&cq->buf);
>
> cq->buf = buf;
> cq->mr = mr;
> @@ -336,8 +333,7 @@ int mthca_destroy_cq(struct ibv_cq *cq)
> }
>
> mthca_dereg_mr(to_mcq(cq)->mr);
> -
> - free(to_mcq(cq)->buf);
> + mthca_free_buf(&to_mcq(cq)->buf);
> free(to_mcq(cq));
>
> return 0;
> @@ -389,7 +385,7 @@ struct ibv_srq *mthca_create_srq(struct
> if (mthca_alloc_srq_buf(pd, &attr->attr, srq))
> goto err;
>
> - srq->mr = __mthca_reg_mr(pd, srq->buf, srq->buf_size, 0, 0);
> + srq->mr = __mthca_reg_mr(pd, srq->buf.buf, srq->buf_size, 0, 0);
> if (!srq->mr)
> goto err_free;
>
> @@ -430,7 +426,7 @@ err_unreg:
>
> err_free:
> free(srq->wrid);
> - free(srq->buf);
> + mthca_free_buf(&srq->buf);
>
> err:
> free(srq);
> @@ -469,7 +465,7 @@ int mthca_destroy_srq(struct ibv_srq *sr
>
> mthca_dereg_mr(to_msrq(srq)->mr);
>
> - free(to_msrq(srq)->buf);
> + mthca_free_buf(&to_msrq(srq)->buf);
> free(to_msrq(srq)->wrid);
> free(to_msrq(srq));
>
> @@ -507,7 +503,7 @@ struct ibv_qp *mthca_create_qp(struct ib
> pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
> goto err_free;
>
> - qp->mr = __mthca_reg_mr(pd, qp->buf, qp->buf_size, 0, 0);
> + qp->mr = __mthca_reg_mr(pd, qp->buf.buf, qp->buf_size, 0, 0);
> if (!qp->mr)
> goto err_free;
>
> @@ -574,7 +570,7 @@ err_unreg:
>
> err_free:
> free(qp->wrid);
> - free(qp->buf);
> + mthca_free_buf(&qp->buf);
>
> err:
> free(qp);
> @@ -655,8 +651,7 @@ int mthca_destroy_qp(struct ibv_qp *qp)
> }
>
> mthca_dereg_mr(to_mqp(qp)->mr);
> -
> - free(to_mqp(qp)->buf);
> + mthca_free_buf(&to_mqp(qp)->buf);
> free(to_mqp(qp)->wrid);
> free(to_mqp(qp));
>
> Index: libmthca/src/mthca.h
> ===================================================================
> --- libmthca/src/mthca.h (revision 8791)
> +++ libmthca/src/mthca.h (working copy)
> @@ -112,6 +112,11 @@ struct mthca_context {
> int qp_table_mask;
> };
>
> +struct mthca_buf {
> + void *buf;
> + size_t length;
> +};
> +
> struct mthca_pd {
> struct ibv_pd ibv_pd;
> struct mthca_ah_page *ah_list;
> @@ -121,7 +126,7 @@ struct mthca_pd {
>
> struct mthca_cq {
> struct ibv_cq ibv_cq;
> - void *buf;
> + struct mthca_buf buf;
> pthread_spinlock_t lock;
> struct ibv_mr *mr;
> uint32_t cqn;
> @@ -137,7 +142,7 @@ struct mthca_cq {
>
> struct mthca_srq {
> struct ibv_srq ibv_srq;
> - void *buf;
> + struct mthca_buf buf;
> void *last;
> pthread_spinlock_t lock;
> struct ibv_mr *mr;
> @@ -174,7 +179,7 @@ struct mthca_wq {
>
> struct mthca_qp {
> struct ibv_qp ibv_qp;
> - void *buf;
> + struct mthca_buf buf;
> uint64_t *wrid;
> int send_wqe_offset;
> int max_inline_data;
> @@ -259,6 +264,9 @@ static inline int mthca_is_memfree(struc
> return to_mdev(ibctx->device)->hca_type == MTHCA_ARBEL;
> }
>
> +int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
> +void mthca_free_buf(struct mthca_buf *buf);
> +
> int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
> uint32_t **db);
> void mthca_set_db_qn(uint32_t *db, enum mthca_db_type type, uint32_t qn);
> @@ -290,7 +298,7 @@ void mthca_arbel_cq_event(struct ibv_cq
> void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn,
> struct mthca_srq *srq);
> void mthca_cq_resize_copy_cqes(struct mthca_cq *cq, void *buf, int new_cqe);
> -void *mthca_alloc_cq_buf(struct mthca_device *dev, int cqe);
> +int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent);
>
> struct ibv_srq *mthca_create_srq(struct ibv_pd *pd,
> struct ibv_srq_init_attr *attr);
> Index: libmthca/src/cq.c
> ===================================================================
> --- libmthca/src/cq.c (revision 8791)
> +++ libmthca/src/cq.c (working copy)
> @@ -126,7 +126,7 @@ struct mthca_err_cqe {
>
> static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
> {
> - return cq->buf + entry * MTHCA_CQ_ENTRY_SIZE;
> + return cq->buf.buf + entry * MTHCA_CQ_ENTRY_SIZE;
> }
>
> static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
> @@ -612,17 +612,16 @@ void mthca_cq_resize_copy_cqes(struct mt
> get_cqe(cq, i & old_cqe), MTHCA_CQ_ENTRY_SIZE);
> }
>
> -void *mthca_alloc_cq_buf(struct mthca_device *dev, int nent)
> +int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent)
> {
> - void *buf;
> int i;
>
> - if (posix_memalign(&buf, dev->page_size,
> - align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size)))
> - return NULL;
> + if (mthca_alloc_buf(buf, align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size),
> + dev->page_size))
> + return -1;
>
> for (i = 0; i < nent; ++i)
> - ((struct mthca_cqe *) buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
> + ((struct mthca_cqe *) buf->buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
>
> - return buf;
> + return 0;
> }
> Index: libmthca/src/srq.c
> ===================================================================
> --- libmthca/src/srq.c (revision 8791)
> +++ libmthca/src/srq.c (working copy)
> @@ -47,7 +47,7 @@
>
> static void *get_wqe(struct mthca_srq *srq, int n)
> {
> - return srq->buf + (n << srq->wqe_shift);
> + return srq->buf.buf + (n << srq->wqe_shift);
> }
>
> /*
> @@ -292,13 +292,14 @@ int mthca_alloc_srq_buf(struct ibv_pd *p
>
> srq->buf_size = srq->max << srq->wqe_shift;
>
> - if (posix_memalign(&srq->buf, to_mdev(pd->context->device)->page_size,
> - align(srq->buf_size, to_mdev(pd->context->device)->page_size))) {
> + if (mthca_alloc_buf(&srq->buf,
> + align(srq->buf_size, to_mdev(pd->context->device)->page_size),
> + to_mdev(pd->context->device)->page_size)) {
> free(srq->wrid);
> return -1;
> }
>
> - memset(srq->buf, 0, srq->buf_size);
> + memset(srq->buf.buf, 0, srq->buf_size);
>
> /*
> * Now initialize the SRQ buffer so that all of the WQEs are
> Index: libmthca/src/ah.c
> ===================================================================
> --- libmthca/src/ah.c (revision 8791)
> +++ libmthca/src/ah.c (working copy)
> @@ -45,7 +45,7 @@
>
> struct mthca_ah_page {
> struct mthca_ah_page *prev, *next;
> - void *buf;
> + struct mthca_buf buf;
> struct ibv_mr *mr;
> int use_cnt;
> unsigned free[0];
> @@ -60,14 +60,14 @@ static struct mthca_ah_page *__add_page(
> if (!page)
> return NULL;
>
> - if (posix_memalign(&page->buf, page_size, page_size)) {
> + if (mthca_alloc_buf(&page->buf, page_size, page_size)) {
> free(page);
> return NULL;
> }
>
> - page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
> + page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf.buf, page_size, 0);
> if (!page->mr) {
> - free(page->buf);
> + mthca_free_buf(&page->buf);
> free(page);
> return NULL;
> }
> @@ -123,7 +123,7 @@ int mthca_alloc_av(struct mthca_pd *pd,
> if (page->free[i]) {
> j = ffs(page->free[i]);
> page->free[i] &= ~(1 << (j - 1));
> - ah->av = page->buf +
> + ah->av = page->buf.buf +
> (i * 8 * sizeof (int) + (j - 1)) * sizeof *ah->av;
> break;
> }
> @@ -172,7 +172,7 @@ void mthca_free_av(struct mthca_ah *ah)
> pthread_mutex_lock(&pd->ah_mutex);
>
> page = ah->page;
> - i = ((void *) ah->av - page->buf) / sizeof *ah->av;
> + i = ((void *) ah->av - page->buf.buf) / sizeof *ah->av;
> page->free[i / (8 * sizeof (int))] |= 1 << (i % (8 * sizeof (int)));
>
> if (!--page->use_cnt) {
> @@ -184,7 +184,7 @@ void mthca_free_av(struct mthca_ah *ah)
> page->next->prev = page->prev;
>
> mthca_dereg_mr(page->mr);
> - free(page->buf);
> + mthca_free_buf(&page->buf);
> free(page);
> }
>
> Index: libmthca/ChangeLog
> ===================================================================
> --- libmthca/ChangeLog (revision 8791)
> +++ libmthca/ChangeLog (working copy)
> @@ -1,3 +1,19 @@
> +2006-07-26 Roland Dreier <rdreier at cisco.com>
> +
> + * src/mthca.h, src/ah.c, src/cq.c, src/memfree.c, src/qp.c,
> + src/srq.c, src/verbs.c: Convert internal allocations for AH pages
> + (for non-memfree HCAs), CQ buffers, doorbell pages (for memfree
> + HCAs), QP buffers and SRQ buffers to use the new buffer
> + allocator. This makes libmthca fork()-clean when built against
> + libibverbs 1.1.
> +
> + * src/buf.c (mthca_alloc_buf, mthca_free_buf): Add new functions
> + to wrap up allocating page-aligned buffers. The new functions
> + will call ibv_dontfork_range()/ibv_dofork_range() to do proper
> + madvise()ing to handle fork(), if applicable.
> +
> + * configure.in: Check for ibv_dontfork_range() and ibv_dofork_range().
> +
> 2006-07-04 Dotan Barak <dotanb at mellanox.co.il>
>
> * src/verbs.c (mthca_create_cq, mthca_resize_cq): Passing huge
> Index: libmthca/Makefile.am
> ===================================================================
> --- libmthca/Makefile.am (revision 8791)
> +++ libmthca/Makefile.am (working copy)
> @@ -12,10 +12,9 @@ else
> mthca_version_script =
> endif
>
> -src_mthca_la_SOURCES = src/ah.c src/cq.c src/memfree.c src/mthca.c src/qp.c \
> - src/srq.c src/verbs.c
> -src_mthca_la_LDFLAGS = -avoid-version -module \
> - $(mthca_version_script)
> +src_mthca_la_SOURCES = src/ah.c src/buf.c src/cq.c src/memfree.c src/mthca.c \
> + src/qp.c src/srq.c src/verbs.c
> +src_mthca_la_LDFLAGS = -avoid-version -module $(mthca_version_script)
>
> DEBIAN = debian/changelog debian/compat debian/control debian/copyright \
> debian/libmthca1.install debian/libmthca-dev.install debian/rules
>
> _______________________________________________
> openib-general mailing list
> openib-general at openib.org
> http://openib.org/mailman/listinfo/openib-general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
--
Gleb.
More information about the general mailing list