[ofw] RE: [HW] memory allocation improvement in user space

Leonid Keller leonid at mellanox.co.il
Sun Jun 29 09:05:50 PDT 2008


Resending ... 

> -----Original Message-----
> From: Leonid Keller 
> Sent: Tuesday, June 24, 2008 1:47 PM
> To: Leonid Keller; Fab Tillier; ofw at lists.openfabrics.org
> Subject: RE: [ofw] RE: [HW] memory allocation improvement in 
> user space
> 
> As an afterthought.
> 
> I quote below only the user part of the patch as it was in 
> the first place.
> Maybe you can notice some (inserted bug) of not-releasing or 
> incorrect work with memory, which caused the problems with 
> MmUnsecureVirtualMemory.
> 
> Index: hw/mlx4/user/hca/buf.c
> ===================================================================
> --- hw/mlx4/user/hca/buf.c	(revision 1294)
> +++ hw/mlx4/user/hca/buf.c	(working copy)
> @@ -35,17 +35,13 @@
>  int mlx4_alloc_buf(struct mlx4_buf *buf, int size, int page_size)  {
>  	int ret;
> -
>  	ret = posix_memalign(&buf->buf, page_size, align(size, 
> page_size));
> -	if (ret)
> -		return ret;
> -
> -	buf->length = size;
> -
> -	return 0;
> +	if (!ret)
> +		buf->length = size;
> +	return ret;
>  }
>  
>  void mlx4_free_buf(struct mlx4_buf *buf)  {
> -	VirtualFree(buf->buf, 0, MEM_RELEASE);
> +	posix_memfree(buf->buf);
>  }
> Index: hw/mlx4/user/hca/l2w.h
> ===================================================================
> --- hw/mlx4/user/hca/l2w.h	(revision 1294)
> +++ hw/mlx4/user/hca/l2w.h	(working copy)
> @@ -74,17 +74,49 @@
>  // FUNCTIONS
>  // ===========================================
>  
> +static inline BOOLEAN is_power_of_2(uint32_t n) {
> +	return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
> +
> +// Allocated memory is zeroed !
>  static inline int posix_memalign(void **memptr, int 
> alignment, int size)  {
> -	UNREFERENCED_PARAMETER(alignment);
> +	int aligned_size, desc_size = sizeof(int);
> +	char *real_addr, *aligned_addr;
>  
> -	*memptr = VirtualAlloc( NULL, size, MEM_COMMIT | 
> MEM_RESERVE,  PAGE_READWRITE );
> -	if (*memptr) 
> -		return 0;
> -	else	
> -		return ENOMEM;
> +	// sanity check: alignment should a power of 2 and more then 2
> +	if ( alignment < desc_size || 
> !is_power_of_2((uint32_t)alignment) )
> +		return -EINVAL;
> +
> +	// calculate size, needed for aligned allocation
> +	aligned_size = size + alignment + desc_size;
> +
> +	// allocate
> +	real_addr = cl_zalloc(aligned_size);
> +	if ( real_addr == NULL )
> +		return -ENOMEM;
> +
> +	// calculate aligned address
> +	aligned_addr = (char *)(((ULONG_PTR)(real_addr + 
> alignment-1)) & ~(alignment - 1));
> +	if ( aligned_addr < real_addr + desc_size )
> +		aligned_addr += alignment;
> +
> +	// store the descriptor
> +	*(int*)(aligned_addr - desc_size) = (int)(aligned_addr 
> - real_addr);
> +	
> +	*memptr = aligned_addr;
> +	return 0;
>  }
>  
> +// there is no such POSIX function. Called so to be similar 
> to the allocation one.
> +static inline void posix_memfree(void *memptr) {
> +	int *desc_addr = (int*)((char*)memptr - sizeof(int));
> +	char *real_addr = (char*)memptr - *desc_addr;
> +	cl_free(real_addr);
> +}
> +
>  static inline int ffsl(uint32_t x)
>  {
>         int r = 0;
> Index: hw/mlx4/user/hca/qp.c
> ===================================================================
> --- hw/mlx4/user/hca/qp.c	(revision 1294)
> +++ hw/mlx4/user/hca/qp.c	(working copy)
> @@ -685,7 +685,6 @@
>  		return -1;
>  	}
>  
> -	memset(qp->buf.buf, 0, qp->buf_size);
>  	mlx4_qp_init_sq_ownership(qp);
>  
>  	return 0;
> Index: hw/mlx4/user/hca/srq.c
> ===================================================================
> --- hw/mlx4/user/hca/srq.c	(revision 1294)
> +++ hw/mlx4/user/hca/srq.c	(working copy)
> @@ -146,8 +146,6 @@
>  		return -1;
>  	}
>  
> -	// srq->buf.buf is zeroed in posix_memalign - 
> memset(srq->buf.buf, 0, buf_size);
> -
>  	/*
>  	 * Now initialize the SRQ buffer so that all of the WQEs are
>  	 * linked into the list of free WQEs.
> Index: hw/mlx4/user/hca/verbs.c
> ===================================================================
> --- hw/mlx4/user/hca/verbs.c	(revision 1294)
> +++ hw/mlx4/user/hca/verbs.c	(working copy)
> @@ -373,8 +373,6 @@
>  						context->page_size))
>  		goto err_alloc_buf;
>  
> -	// cq->buf.buf is zeroed in posix_memalign - 
> memset(cq->buf.buf, 0, buf_size);
> -
>  	cq->ibv_cq.context = context;
>  	cq->cons_index = 0;
>  		
> @@ -718,7 +716,7 @@
>  	attr.cap.max_recv_wr		= p_create_attr->rq_depth;
>  	attr.cap.max_send_sge		= p_create_attr->sq_sge;
>  	attr.cap.max_recv_sge		= p_create_attr->rq_sge;
> -	attr.cap.max_inline_data	= 
> p_create_attr->sq_max_inline;		/* absent in IBAL */
> +	attr.cap.max_inline_data	= p_create_attr->sq_max_inline;
>  	attr.qp_type				= 
> __to_qp_type(p_create_attr->qp_type);
>  	attr.sq_sig_all				= 
> p_create_attr->sq_signaled;
>  
> Index: hw/mthca/user/mlnx_ual_srq.c
> ===================================================================
> --- hw/mthca/user/mlnx_ual_srq.c	(revision 1294)
> +++ hw/mthca/user/mlnx_ual_srq.c	(working copy)
> @@ -54,11 +54,7 @@
>  	}
>  
>  	if (srq->buf) {
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -		cl_free(srq->buf);
> -#else
> -		VirtualFree( srq->buf, 0, MEM_RELEASE);
> -#endif
> +		posix_memfree(srq->buf);
>  	}
>  
>  	if (srq->wrid)
> @@ -158,11 +154,7 @@
>  	goto end;
>  
>  err_alloc_db:
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -		cl_free(srq->buf);
> -#else
> -		VirtualFree( srq->buf, 0, MEM_RELEASE);
> -#endif
> +	posix_memfree(srq->buf);
>  	cl_free(srq->wrid);
>  err_alloc_buf:
>  	cl_spinlock_destroy(&srq->lock);
> Index: hw/mthca/user/mlnx_uvp_memfree.c
> ===================================================================
> --- hw/mthca/user/mlnx_uvp_memfree.c	(revision 1294)
> +++ hw/mthca/user/mlnx_uvp_memfree.c	(working copy)
> @@ -201,11 +201,7 @@
>  
>  	for (i = 0; i < db_tab->npages; ++i)
>  		if (db_tab->page[i].db_rec)
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -			cl_free(db_tab->page[i].db_rec);
> -#else
> -			VirtualFree( db_tab->page[i].db_rec, 0, 
> MEM_RELEASE);
> -#endif
> +			posix_memfree( db_tab->page[i].db_rec);
>  
>  	cl_free(db_tab);
>  }
> Index: hw/mthca/user/mlnx_uvp_verbs.c
> ===================================================================
> --- hw/mthca/user/mlnx_uvp_verbs.c	(revision 1294)
> +++ hw/mthca/user/mlnx_uvp_verbs.c	(working copy)
> @@ -80,11 +80,7 @@
>  		WaitForSingleObject( pd->ah_mutex, INFINITE );
>  		for (page = pd->ah_list; page; page = next_page) {
>  			next_page = page->next;
> -			#ifdef NOT_USE_VIRTUAL_ALLOC	
> -				cl_free(page->buf);
> -			#else
> -				VirtualFree( page->buf, 0, MEM_RELEASE);
> -			#endif
> +			posix_memfree(page->buf);
>  			cl_free(page);
>  		}
>  		ReleaseMutex( pd->ah_mutex );
> @@ -181,7 +177,7 @@
>  			cq->set_ci_db_index);
>  
>  err_unreg:
> -	cl_free(cq->buf);
> +	posix_memfree(cq->buf);
>  
>  err_memalign:
>  	cl_spinlock_destroy(&cq->lock);
> @@ -233,12 +229,7 @@
>  			      to_mcq(cq)->arm_db_index);
>  	}
>  
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -	cl_free(to_mcq(cq)->buf);
> -#else
> -	VirtualFree( to_mcq(cq)->buf, 0, MEM_RELEASE);
> -#endif
> -
> +	posix_memfree(to_mcq(cq)->buf);
>  	
>  	cl_spinlock_destroy(&((struct mthca_cq *)cq)->lock);
>  	cl_free(to_mcq(cq));
> @@ -380,11 +371,7 @@
>  	
>  err_spinlock_sq:
>  	cl_free(qp->wrid);
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -	cl_free(qp->buf);
> -#else
> -	VirtualFree( qp->buf, 0, MEM_RELEASE);
> -#endif
> +	posix_memfree(qp->buf);
>  
>  err_nomem:
>  	cl_free(qp);
> @@ -501,11 +488,7 @@
>  		cl_spinlock_destroy(&((struct mthca_qp *)qp)->sq.lock);
>  		cl_spinlock_destroy(&((struct mthca_qp *)qp)->rq.lock);
>  
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -		cl_free(to_mqp(qp)->buf);
> -#else
> -		VirtualFree( to_mqp(qp)->buf, 0, MEM_RELEASE);
> -#endif
> +		posix_memfree(to_mqp(qp)->buf);
>  		cl_free(to_mqp(qp)->wrid);
>  		cl_free(to_mqp(qp));
>  	}
> Index: hw/mthca/user/mt_l2w.h
> ===================================================================
> --- hw/mthca/user/mt_l2w.h	(revision 1294)
> +++ hw/mthca/user/mt_l2w.h	(working copy)
> @@ -52,32 +52,49 @@
>  
>  extern size_t g_page_size;
>  
> -static inline int posix_memalign(void **memptr, size_t 
> alignment, size_t size)
> +static inline BOOLEAN is_power_of_2(uint32_t n)
>  {
> -#ifdef NOT_USE_VIRTUAL_ALLOC	
> -	// sanity checks
> -	if (alignment % sizeof(void*))
> -		return EINVAL;
> -	if (alignment < g_page_size) {
> -		fprintf(stderr, "mthca: Fatal (posix_memalign): 
> alignment too small - %d \n",  alignment );
> -		return EINVAL;
> -	}
> +	return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
>  
> -	// allocation
> -	*memptr = cl_malloc(size);
> -	if (*memptr) 
> -		return 0;
> -	else	
> -		return ENOMEM;
> -#else
> -	*memptr = VirtualAlloc( NULL, size, MEM_COMMIT | 
> MEM_RESERVE,  PAGE_READWRITE );
> -	if (*memptr) 
> -		return 0;
> -	else	
> -		return ENOMEM;
> -#endif
> +// Allocated memory is zeroed !
> +static inline int posix_memalign(void **memptr, int alignment, int 
> +size) {
> +	int aligned_size, desc_size = sizeof(int);
> +	char *real_addr, *aligned_addr;
> +
> +	// sanity check: alignment should a power of 2 and more then 2
> +	if ( alignment < desc_size || 
> !is_power_of_2((uint32_t)alignment) )
> +		return -EINVAL;
> +
> +	// calculate size, needed for aligned allocation
> +	aligned_size = size + alignment + desc_size;
> +
> +	// allocate
> +	real_addr = cl_zalloc(aligned_size);
> +	if ( real_addr == NULL )
> +		return -ENOMEM;
> +
> +	// calculate aligned address
> +	aligned_addr = (char *)(((ULONG_PTR)(real_addr + 
> alignment-1)) & ~(alignment - 1));
> +	if ( aligned_addr < real_addr + desc_size )
> +		aligned_addr += alignment;
> +
> +	// store the descriptor
> +	*(int*)(aligned_addr - desc_size) = (int)(aligned_addr 
> - real_addr);
> +	
> +	*memptr = aligned_addr;
> +	return 0;
>  }
>  
> +// there is no such POSIX function. Called so to be similar 
> to the allocation one.
> +static inline void posix_memfree(void *memptr) {
> +	int *desc_addr = (int*)((char*)memptr - sizeof(int));
> +	char *real_addr = (char*)memptr - *desc_addr;
> +	cl_free(real_addr);
> +}
> +
>  // ===========================================
>  // FUNCTIONS
>  // ===========================================
> 
> 
> 
> 
> > -----Original Message-----
> > From: ofw-bounces at lists.openfabrics.org 
> > [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of 
> Leonid Keller
> > Sent: Tuesday, June 24, 2008 1:14 PM
> > To: Fab Tillier; ofw at lists.openfabrics.org
> > Subject: [ofw] RE: [HW] memory allocation improvement in user space
> > 
> > See inline
> > 
> > > -----Original Message-----
> > > From: Fab Tillier [mailto:ftillier at windows.microsoft.com]
> > > Sent: Monday, June 23, 2008 8:55 PM
> > > To: Leonid Keller; ofw at lists.openfabrics.org
> > > Subject: RE: [HW] memory allocation improvement in user space
> > > 
> > > Hi Leo,
> > > 
> > > >From: Leonid Keller
> > > >Sent: Monday, June 23, 2008 6:44 AM
> > > >
> > > >Investigation of INSUFFICIENT_MEMORY failures on our 
> stress tests 
> > > >brought us to "revelation", that VirtualAlloc function, used for 
> > > >implementation of posix_memalign, is a very "greedy" one: it
> > > allocates
> > > >at least 64KB memory.
> > > >As far as we usually ask one page, it is 16 times more than
> > > necessary.
> > > 
> > > VirtualAlloc doesn't actually allocate 64K of memory, but it does 
> > > reserve 64K of virtual address space.  It then commits a
> > single page.  
> > > This article:
> > > http://msdn.microsoft.com/en-us/library/ms810627.aspx
> > > explains this behavior: "The minimum size that can be
> > reserved is 64K
> > > ... Requesting one page of reserved addresses results in a
> > 64K address
> > > range" in the section about reserving memory.
> > 
> > I know that. But using reservation and allocation 
> separately requires 
> > a complicate housekeeping system for one and we do not always need 
> > allocate integral pages for two.
> > > 
> > > >Presented below a patch, which implements posix_memalign with
> > > >(ultimately) HeapAlloc functions.
> > > >The patch was tested and worked OK.
> > > >
> > > >An important nuance, that was revealed during testing is
> > as follows:
> > > >A system function, which releases the resources of an
> > > exiting process,
> > > >damages in some way the work of MmSecureVirtualMemory
> > > function, which
> > > >we use today to secure CQ\QP\SRQ circular buffers and 
> user buffers.
> > > >If an application gets killed or exits without releasing the
> > > resources,
> > > >IBBUS catches this event, starts its cascading destroy of
> > > resources and
> > > >crashes on MmUnsecureVirtualMemory.
> > > >Putting MmUnsecureVirtualMemory in try-except block 
> saves from the 
> > > >crash, but an async thread, releasing QPs, freezes on 
> > > >MmUnsecureVirtualMemory, which fails to get some mutex.
> > > 
> > > Maybe the first exception didn't release the mutex?
> > 
> > Maybe. It's not my mutex. It's theirs, internal one.
> > The only change that I did (in user space) was to allocate 
> memory in 
> > the Heap and not as separate pages.
> > MmSecureVirtualMemory/MmUnsecureVirtualMemory pair works on 
> the good 
> > flow, when the resources are released.
> > MmUnsecureVirtualMemory fails with exception only when application 
> > exits without releasing the resources.
> > 
> > > 
> > > >As far as there is no real reason to secure circular
> > buffers, i've
> > > >solved the problem by skipping securing for IB objects.
> > > >User buffers are still secured while memory registration.
> > > >
> > 
> > BTW, DDK *doesn't* recommend to use this function.
> > Its intention is to keep a user buffer from freeing or 
> changing access 
> > rights.
> > We do not need this functionality for circular buffers anyway.
> > 
> > > >Therefore the patch contains 3 kinds of changes:
> > > >1) new implementation of posix_memalign and all related to that;
> > > >2) try-except block around MmUnsecureVirtualMemory;
> > > 
> > > I'd be weary of just working around the issue without 
> understanding 
> > > why it crashes, especially since you found some negative
> > ramifications
> > > to this (the subsequent hang).  What if there's some memory
> > corruption
> > > problem in the UVP - shouldn't that be fixed rather than hidden?
> > > 
> > > >3) new parameter in ib_umem_get and mthca_reg_virt_mr 
> for skipping 
> > > >memory securing and all related to it;
> > > 
> > > -Fab
> > > 
> > 
> > 
> > --
> > Leonid Keller
> > Mellanox Technologies LTD.
> > SW- Windows
> > Phone: +972 (4) 909 7200 (ext 372)
> > Mobile: +972 (54) 464 7702
> > E-mail: leonid at mellanox.co.il
> > 
> > 
> ----------------------------------------------------------------------
> > Emails belong on computers, trees belong in forests; if you 
> must print 
> > this, do it on recycled paper.
> > http://www.greenpeace.org/international/
> > 
> ----------------------------------------------------------------------
> > 
> > 
> > Disclaimer added by CodeTwo Exchange Rules http://www.codetwo.com 
> > _______________________________________________
> > ofw mailing list
> > ofw at lists.openfabrics.org
> > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> > 



More information about the ofw mailing list