[openib-general][PATCH][kdapl]: FMR and EVD patch
James Lentini
jlentini at netapp.com
Wed Aug 17 14:21:18 PDT 2005
Hi Guy,
I've committed all but one of the EVD changes in revision 3126.
I still need a little time to review all of your changes to
dapl_evd_modify_upcall and the FMR implementation.
james
On Wed, 17 Aug 2005, Guy German wrote:
> [kdapl]: this is the same FMR and EVD patch, sent before
> with modifications, after discussions with James.
>
> Signed-off-by: Guy German <guyg at voltaire.com>
>
> Index: ib/dapl_openib_util.c
> ===================================================================
> --- ib/dapl_openib_util.c (revision 3113)
> +++ ib/dapl_openib_util.c (working copy)
> @@ -138,7 +138,7 @@ int dapl_ib_mr_register_ia(struct dapl_i
> }
>
> int dapl_ib_mr_register_physical(struct dapl_ia *ia, struct dapl_lmr *lmr,
> - void *phys_addr, u64 length,
> + void *phys_addr, u32 page_count,
> enum dat_mem_priv_flags privileges)
> {
> int status;
> @@ -150,11 +150,11 @@ int dapl_ib_mr_register_physical(struct
> u64 *array;
>
> array = (u64 *) phys_addr;
> - buf_list = kmalloc(length * sizeof *buf_list, GFP_ATOMIC);
> + buf_list = kmalloc(page_count * sizeof *buf_list, GFP_ATOMIC);
> if (!buf_list)
> return -ENOMEM;
>
> - for (i = 0; i < length; i++) {
> + for (i = 0; i < page_count; i++) {
> buf_list[i].addr = array[i];
> buf_list[i].size = PAGE_SIZE;
> }
> @@ -163,7 +163,7 @@ int dapl_ib_mr_register_physical(struct
> acl = dapl_ib_convert_mem_privileges(privileges);
> acl |= IB_ACCESS_MW_BIND;
> mr = ib_reg_phys_mr(((struct dapl_pz *)lmr->param.pz)->pd,
> - buf_list, length, acl, &iova);
> + buf_list, page_count, acl, &iova);
> kfree(buf_list);
> if (IS_ERR(mr)) {
> status = PTR_ERR(mr);
> @@ -186,13 +186,58 @@ int dapl_ib_mr_register_physical(struct
>
> lmr->param.lmr_context = mr->lkey;
> lmr->param.rmr_context = mr->rkey;
> - lmr->param.registered_size = length * PAGE_SIZE;
> + lmr->param.registered_size = page_count * PAGE_SIZE;
> lmr->param.registered_address = array[0];
> lmr->mr = mr;
>
> dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
> "%s: (%p %d) got lkey 0x%x \n", __func__,
> - buf_list, length, lmr->param.lmr_context);
> + buf_list, page_count, lmr->param.lmr_context);
> + return 0;
> +}
> +
> +int dapl_ib_mr_register_fmr(struct dapl_ia *ia, struct dapl_lmr *lmr,
> + void *phys_addr, u32 page_count,
> + enum dat_mem_priv_flags privileges)
> +{
> + /* FIXME: this phase-1 implementation of fmr doesn't take "privileges"
> + into account. This is a security breach. */
> + u64 io_addr;
> + u64 *page_list;
> + struct ib_pool_fmr *mem;
> + int status;
> +
> + page_list = (u64 *)phys_addr;
> + io_addr = page_list[0];
> +
> + mem = ib_fmr_pool_map_phys (((struct dapl_pz *)lmr->param.pz)->fmr_pool,
> + page_list,
> + page_count,
> + &io_addr);
> + if (IS_ERR(mem)) {
> + status = (int)PTR_ERR(mem);
> + if (status != -EAGAIN)
> + dapl_dbg_log(DAPL_DBG_TYPE_ERR,
> + "fmr_pool_map_phys ret=%d <%d pages>\n",
> + status, page_count);
> +
> + lmr->param.registered_address = 0;
> + lmr->fmr = 0;
> + return status;
> + }
> +
> + lmr->param.lmr_context = mem->fmr->lkey;
> + lmr->param.rmr_context = mem->fmr->rkey;
> + lmr->param.registered_size = page_count * PAGE_SIZE;
> + lmr->param.registered_address = io_addr;
> + lmr->fmr = mem;
> +
> + dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
> + "%s: (addr=%p size=0x%x) lkey=0x%x rkey=0x%x\n", __func__,
> + lmr->param.registered_address,
> + lmr->param.registered_size,
> + lmr->param.lmr_context,
> + lmr->param.rmr_context);
> return 0;
> }
>
> @@ -222,7 +267,10 @@ int dapl_ib_mr_deregister(struct dapl_lm
> {
> int status;
>
> - status = ib_dereg_mr(lmr->mr);
> + if (DAT_MEM_TYPE_PLATFORM == lmr->param.mem_type && lmr->fmr)
> + status = ib_fmr_pool_unmap(lmr->fmr);
> + else
> + status = ib_dereg_mr(lmr->mr);
> if (status < 0) {
> dapl_dbg_log(DAPL_DBG_TYPE_ERR,
> " ib_dereg_mr error code return = %d\n",
> Index: ib/dapl_evd.c
> ===================================================================
> --- ib/dapl_evd.c (revision 3113)
> +++ ib/dapl_evd.c (working copy)
> @@ -42,19 +42,30 @@ static void dapl_evd_upcall_trigger(stru
> int status = 0;
> struct dat_event event;
>
> - /* Only process events if there is an enabled callback function. */
> - if ((evd->upcall.upcall_func == (DAT_UPCALL_FUNC) NULL) ||
> - (evd->upcall_policy == DAT_UPCALL_DISABLE))
> + /* DAT_UPCALL_MANY is not supported */
> + if (evd->is_triggered)
> return;
>
> - for (;;) {
> + spin_lock_irqsave (&evd->common.lock, evd->common.flags);
> + if (evd->is_triggered) {
> + spin_unlock_irqrestore (&evd->common.lock, evd->common.flags);
> + return;
> + }
> + evd->is_triggered = 1;
> + spin_unlock_irqrestore (&evd->common.lock, evd->common.flags);
> + /* Only process events if there is an enabled callback function */
> + while ((evd->upcall.upcall_func != (DAT_UPCALL_FUNC)NULL) &&
> + (evd->upcall_policy != DAT_UPCALL_TEARDOWN) &&
> + (evd->upcall_policy != DAT_UPCALL_DISABLE)) {
> status = dapl_evd_dequeue((struct dat_evd *)evd, &event);
> - if (0 != status)
> - return;
> -
> + if (status)
> + break;
> evd->upcall.upcall_func(evd->upcall.instance_data, &event,
> FALSE);
> }
> + evd->is_triggered = 0;
> +
> + return;
> }
>
> static void dapl_evd_eh_print_wc(struct ib_wc *wc)
> @@ -163,6 +174,7 @@ static struct dapl_evd *dapl_evd_alloc(s
> evd->cq = NULL;
> atomic_set(&evd->evd_ref_count, 0);
> evd->catastrophic_overflow = FALSE;
> + evd->is_triggered = 0;
> evd->qlen = qlen;
> evd->upcall_policy = upcall_policy;
> if ( NULL != upcall )
> @@ -798,25 +810,28 @@ static void dapl_evd_dto_callback(struct
> overflow);
>
> /*
> - * This function does not dequeue from the CQ; only the consumer
> - * can do that. It rearms the completion only if completions should
> - * always occur.
> + * This function does not dequeue from the CQ;
> + * It rearms the completion only if the consumer did not
> + * disable the upcall policy (in order to dequeue the rest
> + * of the events itself)
> */
>
> - if (!overflow && evd->upcall_policy != DAT_UPCALL_DISABLE) {
> - /*
> - * Re-enable callback, *then* trigger.
> - * This guarantees we won't miss any events.
> - */
> - status = ib_req_notify_cq(evd->cq, IB_CQ_NEXT_COMP);
> - if (0 != status)
> - (void)dapl_evd_post_async_error_event(
> - evd->common.owner_ia->async_error_evd,
> - DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR,
> - evd->common.owner_ia);
> -
> + if (!overflow) {
> dapl_evd_upcall_trigger(evd);
> + if ((evd->upcall_policy != DAT_UPCALL_TEARDOWN) &&
> + (evd->upcall_policy != DAT_UPCALL_DISABLE)) {
> + status = ib_req_notify_cq(evd->cq, IB_CQ_NEXT_COMP);
> + if (status)
> + (void)dapl_evd_post_async_error_event(
> + evd->common.owner_ia->async_error_evd,
> + DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR,
> + evd->common.owner_ia);
> + else
> + dapl_evd_upcall_trigger(evd);
> + }
> }
> + else
> + dapl_dbg_log(DAPL_DBG_TYPE_ERR, "%s: evd %p overflowed\n",evd);
> dapl_dbg_log(DAPL_DBG_TYPE_RTN, "%s() returns\n", __func__);
> }
>
> @@ -868,10 +883,11 @@ int dapl_evd_internal_create(struct dapl
>
> /* reset the qlen in the attributes, it may have changed */
> evd->qlen = evd->cq->cqe;
> + if ((evd->upcall_policy != DAT_UPCALL_TEARDOWN) &&
> + (evd->upcall_policy != DAT_UPCALL_DISABLE))
> + status = ib_req_notify_cq(evd->cq, IB_CQ_NEXT_COMP);
>
> - status = ib_req_notify_cq(evd->cq, IB_CQ_NEXT_COMP);
> -
> - if (status != 0)
> + if (status)
> goto bail;
> }
>
> @@ -879,14 +895,14 @@ int dapl_evd_internal_create(struct dapl
> * the EVD
> */
> status = dapl_evd_event_alloc(evd);
> - if (status != 0)
> + if (status)
> goto bail;
>
> dapl_ia_link_evd(ia, evd);
> *evd_ptr = evd;
>
> bail:
> - if (status != 0)
> + if (status)
> if (evd)
> dapl_evd_dealloc(evd);
>
> @@ -1012,15 +1028,40 @@ int dapl_evd_modify_upcall(struct dat_ev
> const struct dat_upcall_object *upcall)
> {
> struct dapl_evd *evd;
> -
> - dapl_dbg_log(DAPL_DBG_TYPE_API, "dapl_modify_upcall(%p)\n", evd_handle);
> + int status = 0;
> + int pending_events;
>
> evd = (struct dapl_evd *)evd_handle;
> + dapl_dbg_log (DAPL_DBG_TYPE_API, "%s: (evd=%p, upcall_policy=%d)\n",
> + __func__, evd_handle, upcall_policy);
>
> + spin_lock_irqsave(&evd->common.lock, evd->common.flags);
> + if ((upcall_policy != DAT_UPCALL_TEARDOWN) &&
> + (upcall_policy != DAT_UPCALL_DISABLE)) {
> + pending_events = dapl_rbuf_count(&evd->pending_event_queue);
> + if (pending_events) {
> + dapl_dbg_log (DAPL_DBG_TYPE_WARN,
> + "%s: (evd %p) there are still %d pending "
> + "events in the queue - policy stays disabled\n",
> + __func__, evd_handle, pending_events);
> + status = -EBUSY;
> + goto bail;
> + }
> + if (evd->evd_flags & DAT_EVD_DTO_FLAG) {
> + status = ib_req_notify_cq(evd->cq, IB_CQ_NEXT_COMP);
> + if (status) {
> + printk(KERN_ERR "%s: dapls_ib_completion_notify"
> + " failed (status=0x%x) \n",__func__,
> + status);
> + goto bail;
> + }
> + }
> + }
> evd->upcall_policy = upcall_policy;
> evd->upcall = *upcall;
> -
> - return 0;
> +bail:
> + spin_unlock_irqrestore(&evd->common.lock, evd->common.flags);
> + return status;
> }
>
> int dapl_evd_post_se(struct dat_evd *evd_handle, const struct dat_event *event)
> @@ -1090,7 +1131,6 @@ int dapl_evd_dequeue(struct dat_evd *evd
>
> bail:
> spin_unlock_irqrestore(&evd->common.lock, evd->common.flags);
> -
> dapl_dbg_log(DAPL_DBG_TYPE_RTN,
> "dapl_evd_dequeue () returns 0x%x\n", status);
>
> Index: ib/dapl_openib_util.h
> ===================================================================
> --- ib/dapl_openib_util.h (revision 3113)
> +++ ib/dapl_openib_util.h (working copy)
> @@ -84,9 +84,13 @@ int dapl_ib_mr_register_ia(struct dapl_i
> enum dat_mem_priv_flags privileges);
>
> int dapl_ib_mr_register_physical(struct dapl_ia *ia, struct dapl_lmr *lmr,
> - void *phys_addr, u64 length,
> + void *phys_addr, u32 page_count,
> enum dat_mem_priv_flags privileges);
>
> +int dapl_ib_mr_register_fmr(struct dapl_ia *ia, struct dapl_lmr *lmr,
> + void *phys_addr, u32 page_count,
> + enum dat_mem_priv_flags privileges);
> +
> int dapl_ib_mr_deregister(struct dapl_lmr *lmr);
>
> int dapl_ib_mr_register_shared(struct dapl_ia *ia, struct dapl_lmr *lmr,
> Index: ib/dapl.h
> ===================================================================
> --- ib/dapl.h (revision 3113)
> +++ ib/dapl.h (working copy)
> @@ -40,9 +40,9 @@
> #include <asm/atomic.h>
>
> #include <kdapl.h>
> -
> -#include "ib_verbs.h"
> -#include "ib_cm.h"
> +#include <ib_verbs.h>
> +#include <ib_cm.h>
> +#include <ib_fmr_pool.h>
>
> /*********************************************************************
> * *
> @@ -173,6 +173,7 @@ struct dapl_evd {
> struct dapl_ring_buffer pending_event_queue;
> enum dat_upcall_policy upcall_policy;
> struct dat_upcall_object upcall;
> + int is_triggered;
> };
>
> struct dapl_ep {
> @@ -229,6 +230,7 @@ struct dapl_pz {
> struct list_head list;
> struct ib_pd *pd;
> atomic_t pz_ref_count;
> + struct ib_fmr_pool *fmr_pool;
> };
>
> struct dapl_lmr {
> @@ -237,6 +239,7 @@ struct dapl_lmr {
> struct list_head list;
> struct dat_lmr_param param;
> struct ib_mr *mr;
> + struct ib_pool_fmr *fmr;
> atomic_t lmr_ref_count;
> };
>
> @@ -628,4 +631,6 @@ extern void dapl_dbg_log(enum dapl_dbg_t
> #define dapl_dbg_log(...)
> #endif /* KDAPL_INFINIBAND_DEBUG */
>
> +extern void set_fmr_params (struct ib_fmr_pool_param *fmr_param_s);
> +extern unsigned int g_dapl_active_fmr;
> #endif /* DAPL_H */
> Index: ib/dapl_pz.c
> ===================================================================
> --- ib/dapl_pz.c (revision 3113)
> +++ ib/dapl_pz.c (working copy)
> @@ -29,7 +29,7 @@
> /*
> * $Id$
> */
> -
> +#include <linux/delay.h>
> #include "dapl.h"
> #include "dapl_ia.h"
> #include "dapl_openib_util.h"
> @@ -89,7 +89,17 @@ int dapl_pz_create(struct dat_ia *ia, st
> status);
> goto error2;
> }
> -
> +
> + if (g_dapl_active_fmr) {
> + struct ib_fmr_pool_param params;
> + set_fmr_params (¶ms);
> + dapl_pz->fmr_pool = ib_create_fmr_pool(dapl_pz->pd, ¶ms);
> + if (IS_ERR(dapl_pz->fmr_pool))
> + dapl_dbg_log(DAPL_DBG_TYPE_WARN,
> + "could not create FMR pool <%ld>",
> + PTR_ERR(dapl_pz->fmr_pool));
> + }
> +
> *pz = (struct dat_pz *)dapl_pz;
> return 0;
>
> @@ -104,7 +114,7 @@ error1:
> int dapl_pz_free(struct dat_pz *pz)
> {
> struct dapl_pz *dapl_pz;
> - int status;
> + int status=0;
>
> dapl_dbg_log(DAPL_DBG_TYPE_API, "dapl_pz_free(%p)\n", pz);
>
> @@ -114,8 +124,15 @@ int dapl_pz_free(struct dat_pz *pz)
> status = -EINVAL;
> goto error;
> }
> -
> - status = ib_dealloc_pd(dapl_pz->pd);
> +
> + if (g_dapl_active_fmr && dapl_pz->fmr_pool) {
> + (void)ib_destroy_fmr_pool(dapl_pz->fmr_pool);
> + dapl_pz->fmr_pool = NULL;
> + }
> +
> + if (dapl_pz->pd)
> + status = ib_dealloc_pd(dapl_pz->pd);
> +
> if (status) {
> dapl_dbg_log(DAPL_DBG_TYPE_ERR, "ib_dealloc_pd failed: %X\n",
> status);
> Index: ib/dapl_ia.c
> ===================================================================
> --- ib/dapl_ia.c (revision 3113)
> +++ ib/dapl_ia.c (working copy)
> @@ -115,7 +115,6 @@ static int dapl_ia_abrupt_close(struct d
> * when we run out of entries, or when we get back to the head
> * if we end up skipping an entry.
> */
> -
> list_for_each_entry(rmr, &ia->rmr_list, list) {
> status = dapl_rmr_free((struct dat_rmr *)rmr);
> if (status != 0)
> @@ -196,7 +195,6 @@ static int dapl_ia_abrupt_close(struct d
> "ia_close(ABRUPT): psp_free(%p) returns %x\n",
> sp, status);
> }
> -
> list_for_each_entry(pz, &ia->pz_list, list) {
> status = dapl_pz_free((struct dat_pz *)pz);
> if (status != 0)
> @@ -266,7 +264,6 @@ static int dapl_ia_graceful_close(struct
> int status = 0;
> int cur_status;
> struct dapl_evd *evd;
> -
> if (!list_empty(&ia->rmr_list) ||
> !list_empty(&ia->rsp_list) ||
> !list_empty(&ia->ep_list) ||
> @@ -745,7 +742,8 @@ int dapl_ia_query(struct dat_ia *ia_ptr,
> provider_attr->provider_version_major = DAPL_PROVIDER_MAJOR;
> provider_attr->provider_version_minor = DAPL_PROVIDER_MINOR;
> provider_attr->lmr_mem_types_supported =
> - DAT_MEM_TYPE_PHYSICAL | DAT_MEM_TYPE_IA;
> + DAT_MEM_TYPE_PHYSICAL | DAT_MEM_TYPE_IA |
> + DAT_MEM_TYPE_PLATFORM;
> provider_attr->iov_ownership_on_return = DAT_IOV_CONSUMER;
> provider_attr->dat_qos_supported = DAT_QOS_BEST_EFFORT;
> provider_attr->completion_flags_supported =
> Index: ib/dapl_provider.c
> ===================================================================
> --- ib/dapl_provider.c (revision 3113)
> +++ ib/dapl_provider.c (working copy)
> @@ -49,8 +49,17 @@ MODULE_AUTHOR("James Lentini");
> #ifdef CONFIG_KDAPL_INFINIBAND_DEBUG
> static DAPL_DBG_MASK g_dapl_dbg_mask = 0;
> module_param_named(dbg_mask, g_dapl_dbg_mask, int, 0644);
> -MODULE_PARM_DESC(dbg_mask, "Bitmask to enable debug message types.");
> +MODULE_PARM_DESC(dbg_mask, "Bitmask to enable debug message types. <default=0>");
> #endif /* CONFIG_KDAPL_INFINIBAND_DEBUG */
> +unsigned int g_dapl_active_fmr = 1;
> +static unsigned int g_dapl_pool_size = 2048;
> +static unsigned int g_dapl_max_pages_per_fmr = 64;
> +module_param_named(active_fmr, g_dapl_active_fmr, int, 0644);
> +module_param_named(pool_size, g_dapl_pool_size, int, 0644);
> +module_param_named(max_pages_per_fmr, g_dapl_max_pages_per_fmr, int, 0644);
> +MODULE_PARM_DESC(active_fmr, "if active_fmr==1, creates fmr pool in pz_create <default=0>");
> +MODULE_PARM_DESC(pool_size, "num of fmr handles in pool <default=2048>");
> +MODULE_PARM_DESC(max_pages_per_fmr, "max size (in pages) of an fmr handle <default=64>");
>
> static LIST_HEAD(g_dapl_provider_list);
>
> @@ -152,6 +161,18 @@ void dapl_dbg_log(enum dapl_dbg_type typ
>
> #endif /* KDAPL_INFINIBAND_DEBUG */
>
> +void set_fmr_params (struct ib_fmr_pool_param *fmr_params)
> +{
> + fmr_params->max_pages_per_fmr = g_dapl_max_pages_per_fmr;
> + fmr_params->pool_size = g_dapl_pool_size;
> + fmr_params->dirty_watermark = 32;
> + fmr_params->cache = 1;
> + fmr_params->flush_function = NULL;
> + fmr_params->access = (IB_ACCESS_LOCAL_WRITE |
> + IB_ACCESS_REMOTE_WRITE |
> + IB_ACCESS_REMOTE_READ);
> +}
> +
> static struct dapl_provider *dapl_provider_alloc(const char *name,
> struct ib_device *device,
> u8 port)
> Index: ib/dapl_lmr.c
> ===================================================================
> --- ib/dapl_lmr.c (revision 3113)
> +++ ib/dapl_lmr.c (working copy)
> @@ -126,7 +126,7 @@ error1:
>
> static inline int dapl_lmr_create_physical(struct dapl_ia *ia,
> union dat_region_description phys_addr,
> - u64 page_count,
> + u64 length,
> enum dat_mem_type mem_type,
> struct dapl_pz *pz,
> enum dat_mem_priv_flags privileges,
> @@ -137,8 +137,10 @@ static inline int dapl_lmr_create_physic
> u64 *registered_address)
> {
> struct dapl_lmr *new_lmr;
> - int status;
> + int status = 0;
> + u32 page_count;
>
> + page_count = (u32)length;
> new_lmr = dapl_lmr_alloc(ia, mem_type, phys_addr,
> page_count, (struct dat_pz *) pz, privileges);
>
> @@ -149,15 +151,24 @@ static inline int dapl_lmr_create_physic
>
> if (DAT_MEM_TYPE_IA == mem_type) {
> status = dapl_ib_mr_register_ia(ia, new_lmr, phys_addr,
> - page_count, privileges);
> + length, privileges);
> }
> - else {
> + else if (DAT_MEM_TYPE_PHYSICAL == mem_type) {
> status = dapl_ib_mr_register_physical(ia, new_lmr,
> phys_addr.for_array,
> page_count, privileges);
> }
> + else if (DAT_MEM_TYPE_PLATFORM == mem_type) {
> + status = dapl_ib_mr_register_fmr(ia, new_lmr,
> + phys_addr.for_array,
> + page_count, privileges);
> + }
> + else {
> + status = -EINVAL;
> + goto error1;
> + }
>
> - if (0 != status)
> + if (status)
> goto error2;
>
> atomic_inc(&pz->pz_ref_count);
> @@ -258,6 +269,11 @@ int dapl_lmr_kcreate(struct dat_ia *ia,
> rmr_context, registered_length,
> registered_address);
> break;
> + case DAT_MEM_TYPE_PLATFORM: /* used as a proprietary Mellanox-FMR */
> + if (!g_dapl_active_fmr) {
> + status = -EINVAL;
> + break;
> + }
> case DAT_MEM_TYPE_PHYSICAL:
> case DAT_MEM_TYPE_IA:
> status = dapl_lmr_create_physical(dapl_ia, region_description,
> @@ -307,6 +323,7 @@ int dapl_lmr_free(struct dat_lmr *lmr)
>
> switch (dapl_lmr->param.mem_type) {
> case DAT_MEM_TYPE_PHYSICAL:
> + case DAT_MEM_TYPE_PLATFORM:
> case DAT_MEM_TYPE_IA:
> case DAT_MEM_TYPE_LMR:
> {
>
More information about the general
mailing list