[ofw] RE: [HW] memory allocation improvement in user space
Leonid Keller
leonid at mellanox.co.il
Sun Jun 29 09:05:50 PDT 2008
Resending ...
> -----Original Message-----
> From: Leonid Keller
> Sent: Tuesday, June 24, 2008 1:47 PM
> To: Leonid Keller; Fab Tillier; ofw at lists.openfabrics.org
> Subject: RE: [ofw] RE: [HW] memory allocation improvement in
> user space
>
> As an afterthought.
>
> I quote below only the user part of the patch as it was in
> the first place.
> Maybe you can notice some (inserted bug) of not-releasing or
> incorrect work with memory, which caused the problems with
> MmUnsecureVirtualMemory.
>
> Index: hw/mlx4/user/hca/buf.c
> ===================================================================
> --- hw/mlx4/user/hca/buf.c (revision 1294)
> +++ hw/mlx4/user/hca/buf.c (working copy)
> @@ -35,17 +35,13 @@
> int mlx4_alloc_buf(struct mlx4_buf *buf, int size, int page_size) {
> int ret;
> -
> ret = posix_memalign(&buf->buf, page_size, align(size,
> page_size));
> - if (ret)
> - return ret;
> -
> - buf->length = size;
> -
> - return 0;
> + if (!ret)
> + buf->length = size;
> + return ret;
> }
>
> void mlx4_free_buf(struct mlx4_buf *buf) {
> - VirtualFree(buf->buf, 0, MEM_RELEASE);
> + posix_memfree(buf->buf);
> }
> Index: hw/mlx4/user/hca/l2w.h
> ===================================================================
> --- hw/mlx4/user/hca/l2w.h (revision 1294)
> +++ hw/mlx4/user/hca/l2w.h (working copy)
> @@ -74,17 +74,49 @@
> // FUNCTIONS
> // ===========================================
>
> +static inline BOOLEAN is_power_of_2(uint32_t n) {
> + return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
> +
> +// Allocated memory is zeroed !
> static inline int posix_memalign(void **memptr, int
> alignment, int size) {
> - UNREFERENCED_PARAMETER(alignment);
> + int aligned_size, desc_size = sizeof(int);
> + char *real_addr, *aligned_addr;
>
> - *memptr = VirtualAlloc( NULL, size, MEM_COMMIT |
> MEM_RESERVE, PAGE_READWRITE );
> - if (*memptr)
> - return 0;
> - else
> - return ENOMEM;
> + // sanity check: alignment should a power of 2 and more then 2
> + if ( alignment < desc_size ||
> !is_power_of_2((uint32_t)alignment) )
> + return -EINVAL;
> +
> + // calculate size, needed for aligned allocation
> + aligned_size = size + alignment + desc_size;
> +
> + // allocate
> + real_addr = cl_zalloc(aligned_size);
> + if ( real_addr == NULL )
> + return -ENOMEM;
> +
> + // calculate aligned address
> + aligned_addr = (char *)(((ULONG_PTR)(real_addr +
> alignment-1)) & ~(alignment - 1));
> + if ( aligned_addr < real_addr + desc_size )
> + aligned_addr += alignment;
> +
> + // store the descriptor
> + *(int*)(aligned_addr - desc_size) = (int)(aligned_addr
> - real_addr);
> +
> + *memptr = aligned_addr;
> + return 0;
> }
>
> +// there is no such POSIX function. Called so to be similar
> to the allocation one.
> +static inline void posix_memfree(void *memptr) {
> + int *desc_addr = (int*)((char*)memptr - sizeof(int));
> + char *real_addr = (char*)memptr - *desc_addr;
> + cl_free(real_addr);
> +}
> +
> static inline int ffsl(uint32_t x)
> {
> int r = 0;
> Index: hw/mlx4/user/hca/qp.c
> ===================================================================
> --- hw/mlx4/user/hca/qp.c (revision 1294)
> +++ hw/mlx4/user/hca/qp.c (working copy)
> @@ -685,7 +685,6 @@
> return -1;
> }
>
> - memset(qp->buf.buf, 0, qp->buf_size);
> mlx4_qp_init_sq_ownership(qp);
>
> return 0;
> Index: hw/mlx4/user/hca/srq.c
> ===================================================================
> --- hw/mlx4/user/hca/srq.c (revision 1294)
> +++ hw/mlx4/user/hca/srq.c (working copy)
> @@ -146,8 +146,6 @@
> return -1;
> }
>
> - // srq->buf.buf is zeroed in posix_memalign -
> memset(srq->buf.buf, 0, buf_size);
> -
> /*
> * Now initialize the SRQ buffer so that all of the WQEs are
> * linked into the list of free WQEs.
> Index: hw/mlx4/user/hca/verbs.c
> ===================================================================
> --- hw/mlx4/user/hca/verbs.c (revision 1294)
> +++ hw/mlx4/user/hca/verbs.c (working copy)
> @@ -373,8 +373,6 @@
> context->page_size))
> goto err_alloc_buf;
>
> - // cq->buf.buf is zeroed in posix_memalign -
> memset(cq->buf.buf, 0, buf_size);
> -
> cq->ibv_cq.context = context;
> cq->cons_index = 0;
>
> @@ -718,7 +716,7 @@
> attr.cap.max_recv_wr = p_create_attr->rq_depth;
> attr.cap.max_send_sge = p_create_attr->sq_sge;
> attr.cap.max_recv_sge = p_create_attr->rq_sge;
> - attr.cap.max_inline_data =
> p_create_attr->sq_max_inline; /* absent in IBAL */
> + attr.cap.max_inline_data = p_create_attr->sq_max_inline;
> attr.qp_type =
> __to_qp_type(p_create_attr->qp_type);
> attr.sq_sig_all =
> p_create_attr->sq_signaled;
>
> Index: hw/mthca/user/mlnx_ual_srq.c
> ===================================================================
> --- hw/mthca/user/mlnx_ual_srq.c (revision 1294)
> +++ hw/mthca/user/mlnx_ual_srq.c (working copy)
> @@ -54,11 +54,7 @@
> }
>
> if (srq->buf) {
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(srq->buf);
> -#else
> - VirtualFree( srq->buf, 0, MEM_RELEASE);
> -#endif
> + posix_memfree(srq->buf);
> }
>
> if (srq->wrid)
> @@ -158,11 +154,7 @@
> goto end;
>
> err_alloc_db:
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(srq->buf);
> -#else
> - VirtualFree( srq->buf, 0, MEM_RELEASE);
> -#endif
> + posix_memfree(srq->buf);
> cl_free(srq->wrid);
> err_alloc_buf:
> cl_spinlock_destroy(&srq->lock);
> Index: hw/mthca/user/mlnx_uvp_memfree.c
> ===================================================================
> --- hw/mthca/user/mlnx_uvp_memfree.c (revision 1294)
> +++ hw/mthca/user/mlnx_uvp_memfree.c (working copy)
> @@ -201,11 +201,7 @@
>
> for (i = 0; i < db_tab->npages; ++i)
> if (db_tab->page[i].db_rec)
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(db_tab->page[i].db_rec);
> -#else
> - VirtualFree( db_tab->page[i].db_rec, 0,
> MEM_RELEASE);
> -#endif
> + posix_memfree( db_tab->page[i].db_rec);
>
> cl_free(db_tab);
> }
> Index: hw/mthca/user/mlnx_uvp_verbs.c
> ===================================================================
> --- hw/mthca/user/mlnx_uvp_verbs.c (revision 1294)
> +++ hw/mthca/user/mlnx_uvp_verbs.c (working copy)
> @@ -80,11 +80,7 @@
> WaitForSingleObject( pd->ah_mutex, INFINITE );
> for (page = pd->ah_list; page; page = next_page) {
> next_page = page->next;
> - #ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(page->buf);
> - #else
> - VirtualFree( page->buf, 0, MEM_RELEASE);
> - #endif
> + posix_memfree(page->buf);
> cl_free(page);
> }
> ReleaseMutex( pd->ah_mutex );
> @@ -181,7 +177,7 @@
> cq->set_ci_db_index);
>
> err_unreg:
> - cl_free(cq->buf);
> + posix_memfree(cq->buf);
>
> err_memalign:
> cl_spinlock_destroy(&cq->lock);
> @@ -233,12 +229,7 @@
> to_mcq(cq)->arm_db_index);
> }
>
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(to_mcq(cq)->buf);
> -#else
> - VirtualFree( to_mcq(cq)->buf, 0, MEM_RELEASE);
> -#endif
> -
> + posix_memfree(to_mcq(cq)->buf);
>
> cl_spinlock_destroy(&((struct mthca_cq *)cq)->lock);
> cl_free(to_mcq(cq));
> @@ -380,11 +371,7 @@
>
> err_spinlock_sq:
> cl_free(qp->wrid);
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(qp->buf);
> -#else
> - VirtualFree( qp->buf, 0, MEM_RELEASE);
> -#endif
> + posix_memfree(qp->buf);
>
> err_nomem:
> cl_free(qp);
> @@ -501,11 +488,7 @@
> cl_spinlock_destroy(&((struct mthca_qp *)qp)->sq.lock);
> cl_spinlock_destroy(&((struct mthca_qp *)qp)->rq.lock);
>
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - cl_free(to_mqp(qp)->buf);
> -#else
> - VirtualFree( to_mqp(qp)->buf, 0, MEM_RELEASE);
> -#endif
> + posix_memfree(to_mqp(qp)->buf);
> cl_free(to_mqp(qp)->wrid);
> cl_free(to_mqp(qp));
> }
> Index: hw/mthca/user/mt_l2w.h
> ===================================================================
> --- hw/mthca/user/mt_l2w.h (revision 1294)
> +++ hw/mthca/user/mt_l2w.h (working copy)
> @@ -52,32 +52,49 @@
>
> extern size_t g_page_size;
>
> -static inline int posix_memalign(void **memptr, size_t
> alignment, size_t size)
> +static inline BOOLEAN is_power_of_2(uint32_t n)
> {
> -#ifdef NOT_USE_VIRTUAL_ALLOC
> - // sanity checks
> - if (alignment % sizeof(void*))
> - return EINVAL;
> - if (alignment < g_page_size) {
> - fprintf(stderr, "mthca: Fatal (posix_memalign):
> alignment too small - %d \n", alignment );
> - return EINVAL;
> - }
> + return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
>
> - // allocation
> - *memptr = cl_malloc(size);
> - if (*memptr)
> - return 0;
> - else
> - return ENOMEM;
> -#else
> - *memptr = VirtualAlloc( NULL, size, MEM_COMMIT |
> MEM_RESERVE, PAGE_READWRITE );
> - if (*memptr)
> - return 0;
> - else
> - return ENOMEM;
> -#endif
> +// Allocated memory is zeroed !
> +static inline int posix_memalign(void **memptr, int alignment, int
> +size) {
> + int aligned_size, desc_size = sizeof(int);
> + char *real_addr, *aligned_addr;
> +
> + // sanity check: alignment should a power of 2 and more then 2
> + if ( alignment < desc_size ||
> !is_power_of_2((uint32_t)alignment) )
> + return -EINVAL;
> +
> + // calculate size, needed for aligned allocation
> + aligned_size = size + alignment + desc_size;
> +
> + // allocate
> + real_addr = cl_zalloc(aligned_size);
> + if ( real_addr == NULL )
> + return -ENOMEM;
> +
> + // calculate aligned address
> + aligned_addr = (char *)(((ULONG_PTR)(real_addr +
> alignment-1)) & ~(alignment - 1));
> + if ( aligned_addr < real_addr + desc_size )
> + aligned_addr += alignment;
> +
> + // store the descriptor
> + *(int*)(aligned_addr - desc_size) = (int)(aligned_addr
> - real_addr);
> +
> + *memptr = aligned_addr;
> + return 0;
> }
>
> +// there is no such POSIX function. Called so to be similar
> to the allocation one.
> +static inline void posix_memfree(void *memptr) {
> + int *desc_addr = (int*)((char*)memptr - sizeof(int));
> + char *real_addr = (char*)memptr - *desc_addr;
> + cl_free(real_addr);
> +}
> +
> // ===========================================
> // FUNCTIONS
> // ===========================================
>
>
>
>
> > -----Original Message-----
> > From: ofw-bounces at lists.openfabrics.org
> > [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of
> Leonid Keller
> > Sent: Tuesday, June 24, 2008 1:14 PM
> > To: Fab Tillier; ofw at lists.openfabrics.org
> > Subject: [ofw] RE: [HW] memory allocation improvement in user space
> >
> > See inline
> >
> > > -----Original Message-----
> > > From: Fab Tillier [mailto:ftillier at windows.microsoft.com]
> > > Sent: Monday, June 23, 2008 8:55 PM
> > > To: Leonid Keller; ofw at lists.openfabrics.org
> > > Subject: RE: [HW] memory allocation improvement in user space
> > >
> > > Hi Leo,
> > >
> > > >From: Leonid Keller
> > > >Sent: Monday, June 23, 2008 6:44 AM
> > > >
> > > >Investigation of INSUFFICIENT_MEMORY failures on our
> stress tests
> > > >brought us to "revelation", that VirtualAlloc function, used for
> > > >implementation of posix_memalign, is a very "greedy" one: it
> > > allocates
> > > >at least 64KB memory.
> > > >As far as we usually ask one page, it is 16 times more than
> > > necessary.
> > >
> > > VirtualAlloc doesn't actually allocate 64K of memory, but it does
> > > reserve 64K of virtual address space. It then commits a
> > single page.
> > > This article:
> > > http://msdn.microsoft.com/en-us/library/ms810627.aspx
> > > explains this behavior: "The minimum size that can be
> > reserved is 64K
> > > ... Requesting one page of reserved addresses results in a
> > 64K address
> > > range" in the section about reserving memory.
> >
> > I know that. But using reservation and allocation
> separately requires
> > a complicate housekeeping system for one and we do not always need
> > allocate integral pages for two.
> > >
> > > >Presented below a patch, which implements posix_memalign with
> > > >(ultimately) HeapAlloc functions.
> > > >The patch was tested and worked OK.
> > > >
> > > >An important nuance, that was revealed during testing is
> > as follows:
> > > >A system function, which releases the resources of an
> > > exiting process,
> > > >damages in some way the work of MmSecureVirtualMemory
> > > function, which
> > > >we use today to secure CQ\QP\SRQ circular buffers and
> user buffers.
> > > >If an application gets killed or exits without releasing the
> > > resources,
> > > >IBBUS catches this event, starts its cascading destroy of
> > > resources and
> > > >crashes on MmUnsecureVirtualMemory.
> > > >Putting MmUnsecureVirtualMemory in try-except block
> saves from the
> > > >crash, but an async thread, releasing QPs, freezes on
> > > >MmUnsecureVirtualMemory, which fails to get some mutex.
> > >
> > > Maybe the first exception didn't release the mutex?
> >
> > Maybe. It's not my mutex. It's theirs, internal one.
> > The only change that I did (in user space) was to allocate
> memory in
> > the Heap and not as separate pages.
> > MmSecureVirtualMemory/MmUnsecureVirtualMemory pair works on
> the good
> > flow, when the resources are released.
> > MmUnsecureVirtualMemory fails with exception only when application
> > exits without releasing the resources.
> >
> > >
> > > >As far as there is no real reason to secure circular
> > buffers, i've
> > > >solved the problem by skipping securing for IB objects.
> > > >User buffers are still secured while memory registration.
> > > >
> >
> > BTW, DDK *doesn't* recommend to use this function.
> > Its intention is to keep a user buffer from freeing or
> changing access
> > rights.
> > We do not need this functionality for circular buffers anyway.
> >
> > > >Therefore the patch contains 3 kinds of changes:
> > > >1) new implementation of posix_memalign and all related to that;
> > > >2) try-except block around MmUnsecureVirtualMemory;
> > >
> > > I'd be weary of just working around the issue without
> understanding
> > > why it crashes, especially since you found some negative
> > ramifications
> > > to this (the subsequent hang). What if there's some memory
> > corruption
> > > problem in the UVP - shouldn't that be fixed rather than hidden?
> > >
> > > >3) new parameter in ib_umem_get and mthca_reg_virt_mr
> for skipping
> > > >memory securing and all related to it;
> > >
> > > -Fab
> > >
> >
> >
> > --
> > Leonid Keller
> > Mellanox Technologies LTD.
> > SW- Windows
> > Phone: +972 (4) 909 7200 (ext 372)
> > Mobile: +972 (54) 464 7702
> > E-mail: leonid at mellanox.co.il
> >
> >
> ----------------------------------------------------------------------
> > Emails belong on computers, trees belong in forests; if you
> must print
> > this, do it on recycled paper.
> > http://www.greenpeace.org/international/
> >
> ----------------------------------------------------------------------
> >
> >
> > Disclaimer added by CodeTwo Exchange Rules http://www.codetwo.com
> > _______________________________________________
> > ofw mailing list
> > ofw at lists.openfabrics.org
> > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> >
More information about the ofw
mailing list