[ofw] RE: [HW] memory allocation improvement in user space

Leonid Keller leonid at mellanox.co.il
Tue Jun 24 03:46:52 PDT 2008


As an afterthought.

I quote below only the user part of the patch as it was in the first
place.
Maybe you can notice some (inserted bug) of not-releasing or incorrect
work with memory, which caused the problems with
MmUnsecureVirtualMemory.

Index: hw/mlx4/user/hca/buf.c
===================================================================
--- hw/mlx4/user/hca/buf.c	(revision 1294)
+++ hw/mlx4/user/hca/buf.c	(working copy)
@@ -35,17 +35,13 @@
 int mlx4_alloc_buf(struct mlx4_buf *buf, int size, int page_size)
 {
 	int ret;
-
 	ret = posix_memalign(&buf->buf, page_size, align(size,
page_size));
-	if (ret)
-		return ret;
-
-	buf->length = size;
-
-	return 0;
+	if (!ret)
+		buf->length = size;
+	return ret;
 }
 
 void mlx4_free_buf(struct mlx4_buf *buf)
 {
-	VirtualFree(buf->buf, 0, MEM_RELEASE);
+	posix_memfree(buf->buf);
 }
Index: hw/mlx4/user/hca/l2w.h
===================================================================
--- hw/mlx4/user/hca/l2w.h	(revision 1294)
+++ hw/mlx4/user/hca/l2w.h	(working copy)
@@ -74,17 +74,49 @@
 // FUNCTIONS
 // ===========================================
 
+static inline BOOLEAN is_power_of_2(uint32_t n)
+{
+	return (!!n & !(n & (n-1))) ? TRUE : FALSE;
+}
+
+// Allocated memory is zeroed !
 static inline int posix_memalign(void **memptr, int alignment, int
size)
 {
-	UNREFERENCED_PARAMETER(alignment);
+	int aligned_size, desc_size = sizeof(int);
+	char *real_addr, *aligned_addr;
 
-	*memptr = VirtualAlloc( NULL, size, MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE );
-	if (*memptr) 
-		return 0;
-	else	
-		return ENOMEM;
+	// sanity check: alignment should a power of 2 and more then 2
+	if ( alignment < desc_size ||
!is_power_of_2((uint32_t)alignment) )
+		return -EINVAL;
+
+	// calculate size, needed for aligned allocation
+	aligned_size = size + alignment + desc_size;
+
+	// allocate
+	real_addr = cl_zalloc(aligned_size);
+	if ( real_addr == NULL )
+		return -ENOMEM;
+
+	// calculate aligned address
+	aligned_addr = (char *)(((ULONG_PTR)(real_addr + alignment-1)) &
~(alignment - 1));
+	if ( aligned_addr < real_addr + desc_size )
+		aligned_addr += alignment;
+
+	// store the descriptor
+	*(int*)(aligned_addr - desc_size) = (int)(aligned_addr -
real_addr);
+	
+	*memptr = aligned_addr;
+	return 0;
 }
 
+// there is no such POSIX function. Called so to be similar to the
allocation one.
+static inline void posix_memfree(void *memptr)
+{
+	int *desc_addr = (int*)((char*)memptr - sizeof(int));
+	char *real_addr = (char*)memptr - *desc_addr;
+	cl_free(real_addr);
+}
+
 static inline int ffsl(uint32_t x)
 {
        int r = 0;
Index: hw/mlx4/user/hca/qp.c
===================================================================
--- hw/mlx4/user/hca/qp.c	(revision 1294)
+++ hw/mlx4/user/hca/qp.c	(working copy)
@@ -685,7 +685,6 @@
 		return -1;
 	}
 
-	memset(qp->buf.buf, 0, qp->buf_size);
 	mlx4_qp_init_sq_ownership(qp);
 
 	return 0;
Index: hw/mlx4/user/hca/srq.c
===================================================================
--- hw/mlx4/user/hca/srq.c	(revision 1294)
+++ hw/mlx4/user/hca/srq.c	(working copy)
@@ -146,8 +146,6 @@
 		return -1;
 	}
 
-	// srq->buf.buf is zeroed in posix_memalign -
memset(srq->buf.buf, 0, buf_size);
-
 	/*
 	 * Now initialize the SRQ buffer so that all of the WQEs are
 	 * linked into the list of free WQEs.
Index: hw/mlx4/user/hca/verbs.c
===================================================================
--- hw/mlx4/user/hca/verbs.c	(revision 1294)
+++ hw/mlx4/user/hca/verbs.c	(working copy)
@@ -373,8 +373,6 @@
 						context->page_size))
 		goto err_alloc_buf;
 
-	// cq->buf.buf is zeroed in posix_memalign - memset(cq->buf.buf,
0, buf_size);
-
 	cq->ibv_cq.context = context;
 	cq->cons_index = 0;
 		
@@ -718,7 +716,7 @@
 	attr.cap.max_recv_wr		= p_create_attr->rq_depth;
 	attr.cap.max_send_sge		= p_create_attr->sq_sge;
 	attr.cap.max_recv_sge		= p_create_attr->rq_sge;
-	attr.cap.max_inline_data	= p_create_attr->sq_max_inline;
/* absent in IBAL */
+	attr.cap.max_inline_data	= p_create_attr->sq_max_inline;
 	attr.qp_type				=
__to_qp_type(p_create_attr->qp_type);
 	attr.sq_sig_all				=
p_create_attr->sq_signaled;
 
Index: hw/mthca/user/mlnx_ual_srq.c
===================================================================
--- hw/mthca/user/mlnx_ual_srq.c	(revision 1294)
+++ hw/mthca/user/mlnx_ual_srq.c	(working copy)
@@ -54,11 +54,7 @@
 	}
 
 	if (srq->buf) {
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-		cl_free(srq->buf);
-#else
-		VirtualFree( srq->buf, 0, MEM_RELEASE);
-#endif
+		posix_memfree(srq->buf);
 	}
 
 	if (srq->wrid) 
@@ -158,11 +154,7 @@
 	goto end;
 
 err_alloc_db:
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-		cl_free(srq->buf);
-#else
-		VirtualFree( srq->buf, 0, MEM_RELEASE);
-#endif
+	posix_memfree(srq->buf);
 	cl_free(srq->wrid);
 err_alloc_buf:
 	cl_spinlock_destroy(&srq->lock);
Index: hw/mthca/user/mlnx_uvp_memfree.c
===================================================================
--- hw/mthca/user/mlnx_uvp_memfree.c	(revision 1294)
+++ hw/mthca/user/mlnx_uvp_memfree.c	(working copy)
@@ -201,11 +201,7 @@
 
 	for (i = 0; i < db_tab->npages; ++i)
 		if (db_tab->page[i].db_rec)
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-			cl_free(db_tab->page[i].db_rec);
-#else
-			VirtualFree( db_tab->page[i].db_rec, 0,
MEM_RELEASE);
-#endif
+			posix_memfree( db_tab->page[i].db_rec);
 
 	cl_free(db_tab);
 }
Index: hw/mthca/user/mlnx_uvp_verbs.c
===================================================================
--- hw/mthca/user/mlnx_uvp_verbs.c	(revision 1294)
+++ hw/mthca/user/mlnx_uvp_verbs.c	(working copy)
@@ -80,11 +80,7 @@
 		WaitForSingleObject( pd->ah_mutex, INFINITE );
 		for (page = pd->ah_list; page; page = next_page) {
 			next_page = page->next;
-			#ifdef NOT_USE_VIRTUAL_ALLOC	
-				cl_free(page->buf);
-			#else
-				VirtualFree( page->buf, 0, MEM_RELEASE);
-			#endif
+			posix_memfree(page->buf);
 			cl_free(page);
 		}
 		ReleaseMutex( pd->ah_mutex );
@@ -181,7 +177,7 @@
 			cq->set_ci_db_index);
 
 err_unreg:
-	cl_free(cq->buf);
+	posix_memfree(cq->buf);
 
 err_memalign:
 	cl_spinlock_destroy(&cq->lock);
@@ -233,12 +229,7 @@
 			      to_mcq(cq)->arm_db_index);
 	}
 
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-	cl_free(to_mcq(cq)->buf);
-#else
-	VirtualFree( to_mcq(cq)->buf, 0, MEM_RELEASE);
-#endif
-
+	posix_memfree(to_mcq(cq)->buf);
 	
 	cl_spinlock_destroy(&((struct mthca_cq *)cq)->lock);
 	cl_free(to_mcq(cq));
@@ -380,11 +371,7 @@
 	
 err_spinlock_sq:
 	cl_free(qp->wrid);
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-	cl_free(qp->buf);
-#else
-	VirtualFree( qp->buf, 0, MEM_RELEASE);
-#endif
+	posix_memfree(qp->buf);
 
 err_nomem:
 	cl_free(qp);
@@ -501,11 +488,7 @@
 		cl_spinlock_destroy(&((struct mthca_qp *)qp)->sq.lock);
 		cl_spinlock_destroy(&((struct mthca_qp *)qp)->rq.lock);
 
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-		cl_free(to_mqp(qp)->buf);
-#else
-		VirtualFree( to_mqp(qp)->buf, 0, MEM_RELEASE);
-#endif
+		posix_memfree(to_mqp(qp)->buf);
 		cl_free(to_mqp(qp)->wrid);
 		cl_free(to_mqp(qp));
 	}
Index: hw/mthca/user/mt_l2w.h
===================================================================
--- hw/mthca/user/mt_l2w.h	(revision 1294)
+++ hw/mthca/user/mt_l2w.h	(working copy)
@@ -52,32 +52,49 @@
 
 extern size_t g_page_size;
 
-static inline int posix_memalign(void **memptr, size_t alignment,
size_t size)
+static inline BOOLEAN is_power_of_2(uint32_t n)
 {
-#ifdef NOT_USE_VIRTUAL_ALLOC	
-	// sanity checks
-	if (alignment % sizeof(void*))
-		return EINVAL;
-	if (alignment < g_page_size) {
-		fprintf(stderr, "mthca: Fatal (posix_memalign):
alignment too small - %d \n",  alignment );
-		return EINVAL;
-	}
+	return (!!n & !(n & (n-1))) ? TRUE : FALSE;
+}
 
-	// allocation
-	*memptr = cl_malloc(size);
-	if (*memptr) 
-		return 0;
-	else	
-		return ENOMEM;
-#else
-	*memptr = VirtualAlloc( NULL, size, MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE );
-	if (*memptr) 
-		return 0;
-	else	
-		return ENOMEM;
-#endif
+// Allocated memory is zeroed !
+static inline int posix_memalign(void **memptr, int alignment, int
size)
+{
+	int aligned_size, desc_size = sizeof(int);
+	char *real_addr, *aligned_addr;
+
+	// sanity check: alignment should a power of 2 and more then 2
+	if ( alignment < desc_size ||
!is_power_of_2((uint32_t)alignment) )
+		return -EINVAL;
+
+	// calculate size, needed for aligned allocation
+	aligned_size = size + alignment + desc_size;
+
+	// allocate
+	real_addr = cl_zalloc(aligned_size);
+	if ( real_addr == NULL )
+		return -ENOMEM;
+
+	// calculate aligned address
+	aligned_addr = (char *)(((ULONG_PTR)(real_addr + alignment-1)) &
~(alignment - 1));
+	if ( aligned_addr < real_addr + desc_size )
+		aligned_addr += alignment;
+
+	// store the descriptor
+	*(int*)(aligned_addr - desc_size) = (int)(aligned_addr -
real_addr);
+	
+	*memptr = aligned_addr;
+	return 0;
 }
 
+// there is no such POSIX function. Called so to be similar to the
allocation one.
+static inline void posix_memfree(void *memptr)
+{
+	int *desc_addr = (int*)((char*)memptr - sizeof(int));
+	char *real_addr = (char*)memptr - *desc_addr;
+	cl_free(real_addr);
+}
+
 // ===========================================
 // FUNCTIONS
 // ===========================================




> -----Original Message-----
> From: ofw-bounces at lists.openfabrics.org 
> [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Leonid Keller
> Sent: Tuesday, June 24, 2008 1:14 PM
> To: Fab Tillier; ofw at lists.openfabrics.org
> Subject: [ofw] RE: [HW] memory allocation improvement in user space
> 
> See inline 
> 
> > -----Original Message-----
> > From: Fab Tillier [mailto:ftillier at windows.microsoft.com]
> > Sent: Monday, June 23, 2008 8:55 PM
> > To: Leonid Keller; ofw at lists.openfabrics.org
> > Subject: RE: [HW] memory allocation improvement in user space
> > 
> > Hi Leo,
> > 
> > >From: Leonid Keller
> > >Sent: Monday, June 23, 2008 6:44 AM
> > >
> > >Investigation of INSUFFICIENT_MEMORY failures on our stress tests 
> > >brought us to "revelation", that VirtualAlloc function, used for 
> > >implementation of posix_memalign, is a very "greedy" one: it
> > allocates
> > >at least 64KB memory.
> > >As far as we usually ask one page, it is 16 times more than
> > necessary.
> > 
> > VirtualAlloc doesn't actually allocate 64K of memory, but it does 
> > reserve 64K of virtual address space.  It then commits a 
> single page.  
> > This article:
> > http://msdn.microsoft.com/en-us/library/ms810627.aspx
> > explains this behavior: "The minimum size that can be 
> reserved is 64K 
> > ... Requesting one page of reserved addresses results in a 
> 64K address 
> > range" in the section about reserving memory.
> 
> I know that. But using reservation and allocation separately 
> requires a complicate housekeeping system for one and we do 
> not always need allocate integral pages for two.
> > 
> > >Presented below a patch, which implements posix_memalign with
> > >(ultimately) HeapAlloc functions.
> > >The patch was tested and worked OK.
> > >
> > >An important nuance, that was revealed during testing is 
> as follows:
> > >A system function, which releases the resources of an
> > exiting process,
> > >damages in some way the work of MmSecureVirtualMemory
> > function, which
> > >we use today to secure CQ\QP\SRQ circular buffers and user buffers.
> > >If an application gets killed or exits without releasing the
> > resources,
> > >IBBUS catches this event, starts its cascading destroy of
> > resources and
> > >crashes on MmUnsecureVirtualMemory.
> > >Putting MmUnsecureVirtualMemory in try-except block saves from the 
> > >crash, but an async thread, releasing QPs, freezes on 
> > >MmUnsecureVirtualMemory, which fails to get some mutex.
> > 
> > Maybe the first exception didn't release the mutex?
> 
> Maybe. It's not my mutex. It's theirs, internal one.
> The only change that I did (in user space) was to allocate 
> memory in the Heap and not as separate pages.
> MmSecureVirtualMemory/MmUnsecureVirtualMemory pair works on 
> the good flow, when the resources are released.
> MmUnsecureVirtualMemory fails with exception only when 
> application exits without releasing the resources.
> 
> > 
> > >As far as there is no real reason to secure circular  
> buffers, i've 
> > >solved the problem by skipping securing for IB objects.
> > >User buffers are still secured while memory registration.
> > >
> 
> BTW, DDK *doesn't* recommend to use this function.
> Its intention is to keep a user buffer from freeing or 
> changing access rights.
> We do not need this functionality for circular buffers anyway.
> 
> > >Therefore the patch contains 3 kinds of changes:
> > >1) new implementation of posix_memalign and all related to that;
> > >2) try-except block around MmUnsecureVirtualMemory;
> > 
> > I'd be weary of just working around the issue without understanding 
> > why it crashes, especially since you found some negative 
> ramifications 
> > to this (the subsequent hang).  What if there's some memory 
> corruption 
> > problem in the UVP - shouldn't that be fixed rather than hidden?
> > 
> > >3) new parameter in ib_umem_get and mthca_reg_virt_mr for skipping 
> > >memory securing and all related to it;
> > 
> > -Fab
> > 
> 
> 
> --
> Leonid Keller
> Mellanox Technologies LTD.
> SW- Windows
> Phone: +972 (4) 909 7200 (ext 372)
> Mobile: +972 (54) 464 7702
> E-mail: leonid at mellanox.co.il
> 
> ----------------------------------------------------------------------
> Emails belong on computers, trees belong in forests; if you 
> must print this, do it on recycled paper.
> http://www.greenpeace.org/international/
> ----------------------------------------------------------------------
> 
> 
> Disclaimer added by CodeTwo Exchange Rules 
> http://www.codetwo.com _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> 


--
Leonid Keller
Mellanox Technologies LTD.
SW- Windows
Phone: +972 (4) 909 7200 (ext 372)
Mobile: +972 (54) 464 7702
E-mail: leonid at mellanox.co.il

----------------------------------------------------------------------
Emails belong on computers, trees belong in forests; if you must print
this, do it on recycled paper.
http://www.greenpeace.org/international/
----------------------------------------------------------------------


Disclaimer added by CodeTwo Exchange Rules
http://www.codetwo.com
-------------- next part --------------
A non-text attachment was scrubbed...
Name: improve_memory_user.patch
Type: application/octet-stream
Size: 8927 bytes
Desc: improve_memory_user.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080624/d26854ad/attachment.obj>


More information about the ofw mailing list