[ofw] RE: [HW] memory allocation improvement in user space
Leonid Keller
leonid at mellanox.co.il
Tue Jun 24 03:46:52 PDT 2008
As an afterthought.
I quote below only the user part of the patch as it was in the first
place.
Perhaps you can spot an inserted bug — a missing release or some other
incorrect memory handling — which caused the problems with
MmUnsecureVirtualMemory.
Index: hw/mlx4/user/hca/buf.c
===================================================================
--- hw/mlx4/user/hca/buf.c (revision 1294)
+++ hw/mlx4/user/hca/buf.c (working copy)
@@ -35,17 +35,13 @@
int mlx4_alloc_buf(struct mlx4_buf *buf, int size, int page_size)
{
int ret;
-
ret = posix_memalign(&buf->buf, page_size, align(size,
page_size));
- if (ret)
- return ret;
-
- buf->length = size;
-
- return 0;
+ if (!ret)
+ buf->length = size;
+ return ret;
}
void mlx4_free_buf(struct mlx4_buf *buf)
{
- VirtualFree(buf->buf, 0, MEM_RELEASE);
+ posix_memfree(buf->buf);
}
Index: hw/mlx4/user/hca/l2w.h
===================================================================
--- hw/mlx4/user/hca/l2w.h (revision 1294)
+++ hw/mlx4/user/hca/l2w.h (working copy)
@@ -74,17 +74,49 @@
// FUNCTIONS
// ===========================================
+static inline BOOLEAN is_power_of_2(uint32_t n)
+{
+ return (!!n & !(n & (n-1))) ? TRUE : FALSE;
+}
+
+// Allocated memory is zeroed !
static inline int posix_memalign(void **memptr, int alignment, int
size)
{
- UNREFERENCED_PARAMETER(alignment);
+ int aligned_size, desc_size = sizeof(int);
+ char *real_addr, *aligned_addr;
- *memptr = VirtualAlloc( NULL, size, MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE );
- if (*memptr)
- return 0;
- else
- return ENOMEM;
+ // sanity check: alignment should be a power of 2 and at least desc_size
+ if ( alignment < desc_size ||
!is_power_of_2((uint32_t)alignment) )
+ return -EINVAL;
+
+ // calculate size, needed for aligned allocation
+ aligned_size = size + alignment + desc_size;
+
+ // allocate
+ real_addr = cl_zalloc(aligned_size);
+ if ( real_addr == NULL )
+ return -ENOMEM;
+
+ // calculate aligned address
+ aligned_addr = (char *)(((ULONG_PTR)(real_addr + alignment-1)) &
~(alignment - 1));
+ if ( aligned_addr < real_addr + desc_size )
+ aligned_addr += alignment;
+
+ // store the descriptor
+ *(int*)(aligned_addr - desc_size) = (int)(aligned_addr -
real_addr);
+
+ *memptr = aligned_addr;
+ return 0;
}
+// There is no such POSIX function; it is named this way for symmetry
+// with the allocation function above.
+static inline void posix_memfree(void *memptr)
+{
+ int *desc_addr = (int*)((char*)memptr - sizeof(int));
+ char *real_addr = (char*)memptr - *desc_addr;
+ cl_free(real_addr);
+}
+
static inline int ffsl(uint32_t x)
{
int r = 0;
Index: hw/mlx4/user/hca/qp.c
===================================================================
--- hw/mlx4/user/hca/qp.c (revision 1294)
+++ hw/mlx4/user/hca/qp.c (working copy)
@@ -685,7 +685,6 @@
return -1;
}
- memset(qp->buf.buf, 0, qp->buf_size);
mlx4_qp_init_sq_ownership(qp);
return 0;
Index: hw/mlx4/user/hca/srq.c
===================================================================
--- hw/mlx4/user/hca/srq.c (revision 1294)
+++ hw/mlx4/user/hca/srq.c (working copy)
@@ -146,8 +146,6 @@
return -1;
}
- // srq->buf.buf is zeroed in posix_memalign -
memset(srq->buf.buf, 0, buf_size);
-
/*
* Now initialize the SRQ buffer so that all of the WQEs are
* linked into the list of free WQEs.
Index: hw/mlx4/user/hca/verbs.c
===================================================================
--- hw/mlx4/user/hca/verbs.c (revision 1294)
+++ hw/mlx4/user/hca/verbs.c (working copy)
@@ -373,8 +373,6 @@
context->page_size))
goto err_alloc_buf;
- // cq->buf.buf is zeroed in posix_memalign - memset(cq->buf.buf,
0, buf_size);
-
cq->ibv_cq.context = context;
cq->cons_index = 0;
@@ -718,7 +716,7 @@
attr.cap.max_recv_wr = p_create_attr->rq_depth;
attr.cap.max_send_sge = p_create_attr->sq_sge;
attr.cap.max_recv_sge = p_create_attr->rq_sge;
- attr.cap.max_inline_data = p_create_attr->sq_max_inline;
/* absent in IBAL */
+ attr.cap.max_inline_data = p_create_attr->sq_max_inline;
attr.qp_type =
__to_qp_type(p_create_attr->qp_type);
attr.sq_sig_all =
p_create_attr->sq_signaled;
Index: hw/mthca/user/mlnx_ual_srq.c
===================================================================
--- hw/mthca/user/mlnx_ual_srq.c (revision 1294)
+++ hw/mthca/user/mlnx_ual_srq.c (working copy)
@@ -54,11 +54,7 @@
}
if (srq->buf) {
-#ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(srq->buf);
-#else
- VirtualFree( srq->buf, 0, MEM_RELEASE);
-#endif
+ posix_memfree(srq->buf);
}
if (srq->wrid)
@@ -158,11 +154,7 @@
goto end;
err_alloc_db:
-#ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(srq->buf);
-#else
- VirtualFree( srq->buf, 0, MEM_RELEASE);
-#endif
+ posix_memfree(srq->buf);
cl_free(srq->wrid);
err_alloc_buf:
cl_spinlock_destroy(&srq->lock);
Index: hw/mthca/user/mlnx_uvp_memfree.c
===================================================================
--- hw/mthca/user/mlnx_uvp_memfree.c (revision 1294)
+++ hw/mthca/user/mlnx_uvp_memfree.c (working copy)
@@ -201,11 +201,7 @@
for (i = 0; i < db_tab->npages; ++i)
if (db_tab->page[i].db_rec)
-#ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(db_tab->page[i].db_rec);
-#else
- VirtualFree( db_tab->page[i].db_rec, 0,
MEM_RELEASE);
-#endif
+ posix_memfree( db_tab->page[i].db_rec);
cl_free(db_tab);
}
Index: hw/mthca/user/mlnx_uvp_verbs.c
===================================================================
--- hw/mthca/user/mlnx_uvp_verbs.c (revision 1294)
+++ hw/mthca/user/mlnx_uvp_verbs.c (working copy)
@@ -80,11 +80,7 @@
WaitForSingleObject( pd->ah_mutex, INFINITE );
for (page = pd->ah_list; page; page = next_page) {
next_page = page->next;
- #ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(page->buf);
- #else
- VirtualFree( page->buf, 0, MEM_RELEASE);
- #endif
+ posix_memfree(page->buf);
cl_free(page);
}
ReleaseMutex( pd->ah_mutex );
@@ -181,7 +177,7 @@
cq->set_ci_db_index);
err_unreg:
- cl_free(cq->buf);
+ posix_memfree(cq->buf);
err_memalign:
cl_spinlock_destroy(&cq->lock);
@@ -233,12 +229,7 @@
to_mcq(cq)->arm_db_index);
}
-#ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(to_mcq(cq)->buf);
-#else
- VirtualFree( to_mcq(cq)->buf, 0, MEM_RELEASE);
-#endif
-
+ posix_memfree(to_mcq(cq)->buf);
cl_spinlock_destroy(&((struct mthca_cq *)cq)->lock);
cl_free(to_mcq(cq));
@@ -380,11 +371,7 @@
err_spinlock_sq:
cl_free(qp->wrid);
-#ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(qp->buf);
-#else
- VirtualFree( qp->buf, 0, MEM_RELEASE);
-#endif
+ posix_memfree(qp->buf);
err_nomem:
cl_free(qp);
@@ -501,11 +488,7 @@
cl_spinlock_destroy(&((struct mthca_qp *)qp)->sq.lock);
cl_spinlock_destroy(&((struct mthca_qp *)qp)->rq.lock);
-#ifdef NOT_USE_VIRTUAL_ALLOC
- cl_free(to_mqp(qp)->buf);
-#else
- VirtualFree( to_mqp(qp)->buf, 0, MEM_RELEASE);
-#endif
+ posix_memfree(to_mqp(qp)->buf);
cl_free(to_mqp(qp)->wrid);
cl_free(to_mqp(qp));
}
Index: hw/mthca/user/mt_l2w.h
===================================================================
--- hw/mthca/user/mt_l2w.h (revision 1294)
+++ hw/mthca/user/mt_l2w.h (working copy)
@@ -52,32 +52,49 @@
extern size_t g_page_size;
-static inline int posix_memalign(void **memptr, size_t alignment,
size_t size)
+static inline BOOLEAN is_power_of_2(uint32_t n)
{
-#ifdef NOT_USE_VIRTUAL_ALLOC
- // sanity checks
- if (alignment % sizeof(void*))
- return EINVAL;
- if (alignment < g_page_size) {
- fprintf(stderr, "mthca: Fatal (posix_memalign):
alignment too small - %d \n", alignment );
- return EINVAL;
- }
+ return (!!n & !(n & (n-1))) ? TRUE : FALSE;
+}
- // allocation
- *memptr = cl_malloc(size);
- if (*memptr)
- return 0;
- else
- return ENOMEM;
-#else
- *memptr = VirtualAlloc( NULL, size, MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE );
- if (*memptr)
- return 0;
- else
- return ENOMEM;
-#endif
+// Allocated memory is zeroed !
+static inline int posix_memalign(void **memptr, int alignment, int
size)
+{
+ int aligned_size, desc_size = sizeof(int);
+ char *real_addr, *aligned_addr;
+
+ // sanity check: alignment should be a power of 2 and at least desc_size
+ if ( alignment < desc_size ||
!is_power_of_2((uint32_t)alignment) )
+ return -EINVAL;
+
+ // calculate size, needed for aligned allocation
+ aligned_size = size + alignment + desc_size;
+
+ // allocate
+ real_addr = cl_zalloc(aligned_size);
+ if ( real_addr == NULL )
+ return -ENOMEM;
+
+ // calculate aligned address
+ aligned_addr = (char *)(((ULONG_PTR)(real_addr + alignment-1)) &
~(alignment - 1));
+ if ( aligned_addr < real_addr + desc_size )
+ aligned_addr += alignment;
+
+ // store the descriptor
+ *(int*)(aligned_addr - desc_size) = (int)(aligned_addr -
real_addr);
+
+ *memptr = aligned_addr;
+ return 0;
}
+// There is no such POSIX function; it is named this way for symmetry
+// with the allocation function above.
+static inline void posix_memfree(void *memptr)
+{
+ int *desc_addr = (int*)((char*)memptr - sizeof(int));
+ char *real_addr = (char*)memptr - *desc_addr;
+ cl_free(real_addr);
+}
+
// ===========================================
// FUNCTIONS
// ===========================================
> -----Original Message-----
> From: ofw-bounces at lists.openfabrics.org
> [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Leonid Keller
> Sent: Tuesday, June 24, 2008 1:14 PM
> To: Fab Tillier; ofw at lists.openfabrics.org
> Subject: [ofw] RE: [HW] memory allocation improvement in user space
>
> See inline
>
> > -----Original Message-----
> > From: Fab Tillier [mailto:ftillier at windows.microsoft.com]
> > Sent: Monday, June 23, 2008 8:55 PM
> > To: Leonid Keller; ofw at lists.openfabrics.org
> > Subject: RE: [HW] memory allocation improvement in user space
> >
> > Hi Leo,
> >
> > >From: Leonid Keller
> > >Sent: Monday, June 23, 2008 6:44 AM
> > >
> > >Investigation of INSUFFICIENT_MEMORY failures on our stress tests
> > >brought us to "revelation", that VirtualAlloc function, used for
> > >implementation of posix_memalign, is a very "greedy" one: it
> > allocates
> > >at least 64KB memory.
> > >As far as we usually ask one page, it is 16 times more than
> > necessary.
> >
> > VirtualAlloc doesn't actually allocate 64K of memory, but it does
> > reserve 64K of virtual address space. It then commits a
> single page.
> > This article:
> > http://msdn.microsoft.com/en-us/library/ms810627.aspx
> > explains this behavior: "The minimum size that can be
> reserved is 64K
> > ... Requesting one page of reserved addresses results in a
> 64K address
> > range" in the section about reserving memory.
>
> I know that. But using reservation and allocation separately
> requires a complicate housekeeping system for one and we do
> not always need allocate integral pages for two.
> >
> > >Presented below a patch, which implements posix_memalign with
> > >(ultimately) HeapAlloc functions.
> > >The patch was tested and worked OK.
> > >
> > >An important nuance, that was revealed during testing is
> as follows:
> > >A system function, which releases the resources of an
> > exiting process,
> > >damages in some way the work of MmSecureVirtualMemory
> > function, which
> > >we use today to secure CQ\QP\SRQ circular buffers and user buffers.
> > >If an application gets killed or exits without releasing the
> > resources,
> > >IBBUS catches this event, starts its cascading destroy of
> > resources and
> > >crashes on MmUnsecureVirtualMemory.
> > >Putting MmUnsecureVirtualMemory in try-except block saves from the
> > >crash, but an async thread, releasing QPs, freezes on
> > >MmUnsecureVirtualMemory, which fails to get some mutex.
> >
> > Maybe the first exception didn't release the mutex?
>
> Maybe. It's not my mutex. It's theirs, internal one.
> The only change that I did (in user space) was to allocate
> memory in the Heap and not as separate pages.
> MmSecureVirtualMemory/MmUnsecureVirtualMemory pair works on
> the good flow, when the resources are released.
> MmUnsecureVirtualMemory fails with exception only when
> application exits without releasing the resources.
>
> >
> > >As far as there is no real reason to secure circular
> buffers, i've
> > >solved the problem by skipping securing for IB objects.
> > >User buffers are still secured while memory registration.
> > >
>
> BTW, DDK *doesn't* recommend to use this function.
> Its intention is to keep a user buffer from freeing or
> changing access rights.
> We do not need this functionality for circular buffers anyway.
>
> > >Therefore the patch contains 3 kinds of changes:
> > >1) new implementation of posix_memalign and all related to that;
> > >2) try-except block around MmUnsecureVirtualMemory;
> >
> > I'd be weary of just working around the issue without understanding
> > why it crashes, especially since you found some negative
> ramifications
> > to this (the subsequent hang). What if there's some memory
> corruption
> > problem in the UVP - shouldn't that be fixed rather than hidden?
> >
> > >3) new parameter in ib_umem_get and mthca_reg_virt_mr for skipping
> > >memory securing and all related to it;
> >
> > -Fab
> >
>
>
> --
> Leonid Keller
> Mellanox Technologies LTD.
> SW- Windows
> Phone: +972 (4) 909 7200 (ext 372)
> Mobile: +972 (54) 464 7702
> E-mail: leonid at mellanox.co.il
>
> ----------------------------------------------------------------------
> Emails belong on computers, trees belong in forests; if you
> must print this, do it on recycled paper.
> http://www.greenpeace.org/international/
> ----------------------------------------------------------------------
>
>
> Disclaimer added by CodeTwo Exchange Rules
> http://www.codetwo.com _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>
--
Leonid Keller
Mellanox Technologies LTD.
SW- Windows
Phone: +972 (4) 909 7200 (ext 372)
Mobile: +972 (54) 464 7702
E-mail: leonid at mellanox.co.il
----------------------------------------------------------------------
Emails belong on computers, trees belong in forests; if you must print
this, do it on recycled paper.
http://www.greenpeace.org/international/
----------------------------------------------------------------------
Disclaimer added by CodeTwo Exchange Rules
http://www.codetwo.com
-------------- next part --------------
A non-text attachment was scrubbed...
Name: improve_memory_user.patch
Type: application/octet-stream
Size: 8927 bytes
Desc: improve_memory_user.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080624/d26854ad/attachment.obj>
More information about the ofw
mailing list