[ofw] RE: [HW] memory allocation improvement in user space
Leonid Keller
leonid at mellanox.co.il
Tue Jul 1 03:34:46 PDT 2008
Applied in 1311.
> -----Original Message-----
> From: Leonid Keller
> Sent: Sunday, June 29, 2008 7:06 PM
> To: Leonid Keller; Fab Tillier; ofw at lists.openfabrics.org
> Subject: RE: [ofw] RE: [HW] memory allocation improvement in
> user space
>
> Resending ...
>
> > -----Original Message-----
> > From: Leonid Keller
> > Sent: Tuesday, June 24, 2008 1:47 PM
> > To: Leonid Keller; Fab Tillier; ofw at lists.openfabrics.org
> > Subject: RE: [ofw] RE: [HW] memory allocation improvement in user
> > space
> >
> > As an afterthought.
> >
> > I quote below only the user part of the patch as it was in
> the first
> > place.
> > Maybe you can notice some (inserted bug) of not-releasing
> or incorrect
> > work with memory, which caused the problems with
> > MmUnsecureVirtualMemory.
> >
> > Index: hw/mlx4/user/hca/buf.c
> > ===================================================================
> > --- hw/mlx4/user/hca/buf.c (revision 1294)
> > +++ hw/mlx4/user/hca/buf.c (working copy)
> > @@ -35,17 +35,13 @@
> > int mlx4_alloc_buf(struct mlx4_buf *buf, int size, int
> page_size) {
> > int ret;
> > -
> > ret = posix_memalign(&buf->buf, page_size, align(size,
> page_size));
> > - if (ret)
> > - return ret;
> > -
> > - buf->length = size;
> > -
> > - return 0;
> > + if (!ret)
> > + buf->length = size;
> > + return ret;
> > }
> >
> > void mlx4_free_buf(struct mlx4_buf *buf) {
> > - VirtualFree(buf->buf, 0, MEM_RELEASE);
> > + posix_memfree(buf->buf);
> > }
> > Index: hw/mlx4/user/hca/l2w.h
> > ===================================================================
> > --- hw/mlx4/user/hca/l2w.h (revision 1294)
> > +++ hw/mlx4/user/hca/l2w.h (working copy)
> > @@ -74,17 +74,49 @@
> > // FUNCTIONS
> > // ===========================================
> >
> > +static inline BOOLEAN is_power_of_2(uint32_t n) {
> > + return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
> > +
> > +// Allocated memory is zeroed !
> > static inline int posix_memalign(void **memptr, int alignment, int
> > size) {
> > - UNREFERENCED_PARAMETER(alignment);
> > + int aligned_size, desc_size = sizeof(int);
> > + char *real_addr, *aligned_addr;
> >
> > - *memptr = VirtualAlloc( NULL, size, MEM_COMMIT |
> > MEM_RESERVE, PAGE_READWRITE );
> > - if (*memptr)
> > - return 0;
> > - else
> > - return ENOMEM;
> > + // sanity check: alignment should a power of 2 and more then 2
> > + if ( alignment < desc_size ||
> > !is_power_of_2((uint32_t)alignment) )
> > + return -EINVAL;
> > +
> > + // calculate size, needed for aligned allocation
> > + aligned_size = size + alignment + desc_size;
> > +
> > + // allocate
> > + real_addr = cl_zalloc(aligned_size);
> > + if ( real_addr == NULL )
> > + return -ENOMEM;
> > +
> > + // calculate aligned address
> > + aligned_addr = (char *)(((ULONG_PTR)(real_addr +
> > alignment-1)) & ~(alignment - 1));
> > + if ( aligned_addr < real_addr + desc_size )
> > + aligned_addr += alignment;
> > +
> > + // store the descriptor
> > + *(int*)(aligned_addr - desc_size) = (int)(aligned_addr
> > - real_addr);
> > +
> > + *memptr = aligned_addr;
> > + return 0;
> > }
> >
> > +// there is no such POSIX function. Called so to be similar
> > to the allocation one.
> > +static inline void posix_memfree(void *memptr) {
> > + int *desc_addr = (int*)((char*)memptr - sizeof(int));
> > + char *real_addr = (char*)memptr - *desc_addr;
> > + cl_free(real_addr);
> > +}
> > +
> > static inline int ffsl(uint32_t x)
> > {
> > int r = 0;
> > Index: hw/mlx4/user/hca/qp.c
> > ===================================================================
> > --- hw/mlx4/user/hca/qp.c (revision 1294)
> > +++ hw/mlx4/user/hca/qp.c (working copy)
> > @@ -685,7 +685,6 @@
> > return -1;
> > }
> >
> > - memset(qp->buf.buf, 0, qp->buf_size);
> > mlx4_qp_init_sq_ownership(qp);
> >
> > return 0;
> > Index: hw/mlx4/user/hca/srq.c
> > ===================================================================
> > --- hw/mlx4/user/hca/srq.c (revision 1294)
> > +++ hw/mlx4/user/hca/srq.c (working copy)
> > @@ -146,8 +146,6 @@
> > return -1;
> > }
> >
> > - // srq->buf.buf is zeroed in posix_memalign -
> > memset(srq->buf.buf, 0, buf_size);
> > -
> > /*
> > * Now initialize the SRQ buffer so that all of the WQEs are
> > * linked into the list of free WQEs.
> > Index: hw/mlx4/user/hca/verbs.c
> > ===================================================================
> > --- hw/mlx4/user/hca/verbs.c (revision 1294)
> > +++ hw/mlx4/user/hca/verbs.c (working copy)
> > @@ -373,8 +373,6 @@
> > context->page_size))
> > goto err_alloc_buf;
> >
> > - // cq->buf.buf is zeroed in posix_memalign -
> > memset(cq->buf.buf, 0, buf_size);
> > -
> > cq->ibv_cq.context = context;
> > cq->cons_index = 0;
> >
> > @@ -718,7 +716,7 @@
> > attr.cap.max_recv_wr = p_create_attr->rq_depth;
> > attr.cap.max_send_sge = p_create_attr->sq_sge;
> > attr.cap.max_recv_sge = p_create_attr->rq_sge;
> > - attr.cap.max_inline_data =
> > p_create_attr->sq_max_inline; /* absent in IBAL */
> > + attr.cap.max_inline_data = p_create_attr->sq_max_inline;
> > attr.qp_type =
> > __to_qp_type(p_create_attr->qp_type);
> > attr.sq_sig_all =
> > p_create_attr->sq_signaled;
> >
> > Index: hw/mthca/user/mlnx_ual_srq.c
> > ===================================================================
> > --- hw/mthca/user/mlnx_ual_srq.c (revision 1294)
> > +++ hw/mthca/user/mlnx_ual_srq.c (working copy)
> > @@ -54,11 +54,7 @@
> > }
> >
> > if (srq->buf) {
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(srq->buf);
> > -#else
> > - VirtualFree( srq->buf, 0, MEM_RELEASE);
> > -#endif
> > + posix_memfree(srq->buf);
> > }
> >
> > if (srq->wrid)
> > @@ -158,11 +154,7 @@
> > goto end;
> >
> > err_alloc_db:
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(srq->buf);
> > -#else
> > - VirtualFree( srq->buf, 0, MEM_RELEASE);
> > -#endif
> > + posix_memfree(srq->buf);
> > cl_free(srq->wrid);
> > err_alloc_buf:
> > cl_spinlock_destroy(&srq->lock);
> > Index: hw/mthca/user/mlnx_uvp_memfree.c
> > ===================================================================
> > --- hw/mthca/user/mlnx_uvp_memfree.c (revision 1294)
> > +++ hw/mthca/user/mlnx_uvp_memfree.c (working copy)
> > @@ -201,11 +201,7 @@
> >
> > for (i = 0; i < db_tab->npages; ++i)
> > if (db_tab->page[i].db_rec)
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(db_tab->page[i].db_rec);
> > -#else
> > - VirtualFree( db_tab->page[i].db_rec, 0,
> > MEM_RELEASE);
> > -#endif
> > + posix_memfree( db_tab->page[i].db_rec);
> >
> > cl_free(db_tab);
> > }
> > Index: hw/mthca/user/mlnx_uvp_verbs.c
> > ===================================================================
> > --- hw/mthca/user/mlnx_uvp_verbs.c (revision 1294)
> > +++ hw/mthca/user/mlnx_uvp_verbs.c (working copy)
> > @@ -80,11 +80,7 @@
> > WaitForSingleObject( pd->ah_mutex, INFINITE );
> > for (page = pd->ah_list; page; page = next_page) {
> > next_page = page->next;
> > - #ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(page->buf);
> > - #else
> > - VirtualFree( page->buf, 0, MEM_RELEASE);
> > - #endif
> > + posix_memfree(page->buf);
> > cl_free(page);
> > }
> > ReleaseMutex( pd->ah_mutex );
> > @@ -181,7 +177,7 @@
> > cq->set_ci_db_index);
> >
> > err_unreg:
> > - cl_free(cq->buf);
> > + posix_memfree(cq->buf);
> >
> > err_memalign:
> > cl_spinlock_destroy(&cq->lock);
> > @@ -233,12 +229,7 @@
> > to_mcq(cq)->arm_db_index);
> > }
> >
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(to_mcq(cq)->buf);
> > -#else
> > - VirtualFree( to_mcq(cq)->buf, 0, MEM_RELEASE);
> > -#endif
> > -
> > + posix_memfree(to_mcq(cq)->buf);
> >
> > cl_spinlock_destroy(&((struct mthca_cq *)cq)->lock);
> > cl_free(to_mcq(cq));
> > @@ -380,11 +371,7 @@
> >
> > err_spinlock_sq:
> > cl_free(qp->wrid);
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(qp->buf);
> > -#else
> > - VirtualFree( qp->buf, 0, MEM_RELEASE);
> > -#endif
> > + posix_memfree(qp->buf);
> >
> > err_nomem:
> > cl_free(qp);
> > @@ -501,11 +488,7 @@
> > cl_spinlock_destroy(&((struct mthca_qp *)qp)->sq.lock);
> > cl_spinlock_destroy(&((struct mthca_qp *)qp)->rq.lock);
> >
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - cl_free(to_mqp(qp)->buf);
> > -#else
> > - VirtualFree( to_mqp(qp)->buf, 0, MEM_RELEASE);
> > -#endif
> > + posix_memfree(to_mqp(qp)->buf);
> > cl_free(to_mqp(qp)->wrid);
> > cl_free(to_mqp(qp));
> > }
> > Index: hw/mthca/user/mt_l2w.h
> > ===================================================================
> > --- hw/mthca/user/mt_l2w.h (revision 1294)
> > +++ hw/mthca/user/mt_l2w.h (working copy)
> > @@ -52,32 +52,49 @@
> >
> > extern size_t g_page_size;
> >
> > -static inline int posix_memalign(void **memptr, size_t alignment,
> > size_t size)
> > +static inline BOOLEAN is_power_of_2(uint32_t n)
> > {
> > -#ifdef NOT_USE_VIRTUAL_ALLOC
> > - // sanity checks
> > - if (alignment % sizeof(void*))
> > - return EINVAL;
> > - if (alignment < g_page_size) {
> > - fprintf(stderr, "mthca: Fatal (posix_memalign):
> > alignment too small - %d \n", alignment );
> > - return EINVAL;
> > - }
> > + return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
> >
> > - // allocation
> > - *memptr = cl_malloc(size);
> > - if (*memptr)
> > - return 0;
> > - else
> > - return ENOMEM;
> > -#else
> > - *memptr = VirtualAlloc( NULL, size, MEM_COMMIT |
> > MEM_RESERVE, PAGE_READWRITE );
> > - if (*memptr)
> > - return 0;
> > - else
> > - return ENOMEM;
> > -#endif
> > +// Allocated memory is zeroed !
> > +static inline int posix_memalign(void **memptr, int alignment, int
> > +size) {
> > + int aligned_size, desc_size = sizeof(int);
> > + char *real_addr, *aligned_addr;
> > +
> > + // sanity check: alignment should a power of 2 and more then 2
> > + if ( alignment < desc_size ||
> > !is_power_of_2((uint32_t)alignment) )
> > + return -EINVAL;
> > +
> > + // calculate size, needed for aligned allocation
> > + aligned_size = size + alignment + desc_size;
> > +
> > + // allocate
> > + real_addr = cl_zalloc(aligned_size);
> > + if ( real_addr == NULL )
> > + return -ENOMEM;
> > +
> > + // calculate aligned address
> > + aligned_addr = (char *)(((ULONG_PTR)(real_addr +
> > alignment-1)) & ~(alignment - 1));
> > + if ( aligned_addr < real_addr + desc_size )
> > + aligned_addr += alignment;
> > +
> > + // store the descriptor
> > + *(int*)(aligned_addr - desc_size) = (int)(aligned_addr
> > - real_addr);
> > +
> > + *memptr = aligned_addr;
> > + return 0;
> > }
> >
> > +// there is no such POSIX function. Called so to be similar
> > to the allocation one.
> > +static inline void posix_memfree(void *memptr) {
> > + int *desc_addr = (int*)((char*)memptr - sizeof(int));
> > + char *real_addr = (char*)memptr - *desc_addr;
> > + cl_free(real_addr);
> > +}
> > +
> > // ===========================================
> > // FUNCTIONS
> > // ===========================================
> >
> >
> >
> >
> > > -----Original Message-----
> > > From: ofw-bounces at lists.openfabrics.org
> > > [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of
> > Leonid Keller
> > > Sent: Tuesday, June 24, 2008 1:14 PM
> > > To: Fab Tillier; ofw at lists.openfabrics.org
> > > Subject: [ofw] RE: [HW] memory allocation improvement in
> user space
> > >
> > > See inline
> > >
> > > > -----Original Message-----
> > > > From: Fab Tillier [mailto:ftillier at windows.microsoft.com]
> > > > Sent: Monday, June 23, 2008 8:55 PM
> > > > To: Leonid Keller; ofw at lists.openfabrics.org
> > > > Subject: RE: [HW] memory allocation improvement in user space
> > > >
> > > > Hi Leo,
> > > >
> > > > >From: Leonid Keller
> > > > >Sent: Monday, June 23, 2008 6:44 AM
> > > > >
> > > > >Investigation of INSUFFICIENT_MEMORY failures on our
> > stress tests
> > > > >brought us to "revelation", that VirtualAlloc
> function, used for
> > > > >implementation of posix_memalign, is a very "greedy" one: it
> > > > allocates
> > > > >at least 64KB memory.
> > > > >As far as we usually ask one page, it is 16 times more than
> > > > necessary.
> > > >
> > > > VirtualAlloc doesn't actually allocate 64K of memory,
> but it does
> > > > reserve 64K of virtual address space. It then commits a
> > > single page.
> > > > This article:
> > > > http://msdn.microsoft.com/en-us/library/ms810627.aspx
> > > > explains this behavior: "The minimum size that can be
> > > reserved is 64K
> > > > ... Requesting one page of reserved addresses results in a
> > > 64K address
> > > > range" in the section about reserving memory.
> > >
> > > I know that. But using reservation and allocation
> > separately requires
> > > a complicate housekeeping system for one and we do not
> always need
> > > allocate integral pages for two.
> > > >
> > > > >Presented below a patch, which implements posix_memalign with
> > > > >(ultimately) HeapAlloc functions.
> > > > >The patch was tested and worked OK.
> > > > >
> > > > >An important nuance, that was revealed during testing is
> > > as follows:
> > > > >A system function, which releases the resources of an
> > > > exiting process,
> > > > >damages in some way the work of MmSecureVirtualMemory
> > > > function, which
> > > > >we use today to secure CQ\QP\SRQ circular buffers and
> > user buffers.
> > > > >If an application gets killed or exits without releasing the
> > > > resources,
> > > > >IBBUS catches this event, starts its cascading destroy of
> > > > resources and
> > > > >crashes on MmUnsecureVirtualMemory.
> > > > >Putting MmUnsecureVirtualMemory in try-except block
> > saves from the
> > > > >crash, but an async thread, releasing QPs, freezes on
> > > > >MmUnsecureVirtualMemory, which fails to get some mutex.
> > > >
> > > > Maybe the first exception didn't release the mutex?
> > >
> > > Maybe. It's not my mutex. It's theirs, internal one.
> > > The only change that I did (in user space) was to allocate
> > memory in
> > > the Heap and not as separate pages.
> > > MmSecureVirtualMemory/MmUnsecureVirtualMemory pair works on
> > the good
> > > flow, when the resources are released.
> > > MmUnsecureVirtualMemory fails with exception only when
> application
> > > exits without releasing the resources.
> > >
> > > >
> > > > >As far as there is no real reason to secure circular
> > > buffers, i've
> > > > >solved the problem by skipping securing for IB objects.
> > > > >User buffers are still secured while memory registration.
> > > > >
> > >
> > > BTW, DDK *doesn't* recommend to use this function.
> > > Its intention is to keep a user buffer from freeing or
> > changing access
> > > rights.
> > > We do not need this functionality for circular buffers anyway.
> > >
> > > > >Therefore the patch contains 3 kinds of changes:
> > > > >1) new implementation of posix_memalign and all
> related to that;
> > > > >2) try-except block around MmUnsecureVirtualMemory;
> > > >
> > > > I'd be weary of just working around the issue without
> > understanding
> > > > why it crashes, especially since you found some negative
> > > ramifications
> > > > to this (the subsequent hang). What if there's some memory
> > > corruption
> > > > problem in the UVP - shouldn't that be fixed rather than hidden?
> > > >
> > > > >3) new parameter in ib_umem_get and mthca_reg_virt_mr
> > for skipping
> > > > >memory securing and all related to it;
> > > >
> > > > -Fab
> > > >
> > >
> > >
> > > --
> > > Leonid Keller
> > > Mellanox Technologies LTD.
> > > SW- Windows
> > > Phone: +972 (4) 909 7200 (ext 372)
> > > Mobile: +972 (54) 464 7702
> > > E-mail: leonid at mellanox.co.il
> > >
> > >
> >
> ----------------------------------------------------------------------
> > > Emails belong on computers, trees belong in forests; if you
> > must print
> > > this, do it on recycled paper.
> > > http://www.greenpeace.org/international/
> > >
> >
> ----------------------------------------------------------------------
> > >
> > >
> > > Disclaimer added by CodeTwo Exchange Rules http://www.codetwo.com
> > > _______________________________________________
> > > ofw mailing list
> > > ofw at lists.openfabrics.org
> > > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> > >
More information about the ofw
mailing list