[ofw] RE: [HW] memory allocation improvement in user space

Leonid Keller leonid at mellanox.co.il
Tue Jul 1 03:34:46 PDT 2008


Applied in 1311. 

> -----Original Message-----
> From: Leonid Keller 
> Sent: Sunday, June 29, 2008 7:06 PM
> To: Leonid Keller; Fab Tillier; ofw at lists.openfabrics.org
> Subject: RE: [ofw] RE: [HW] memory allocation improvement in 
> user space
> 
> Resending ... 
> 
> > -----Original Message-----
> > From: Leonid Keller
> > Sent: Tuesday, June 24, 2008 1:47 PM
> > To: Leonid Keller; Fab Tillier; ofw at lists.openfabrics.org
> > Subject: RE: [ofw] RE: [HW] memory allocation improvement in user 
> > space
> > 
> > As an afterthought.
> > 
> > I quote below only the user part of the patch as it was in 
> the first 
> > place.
> > Maybe you can notice some (inserted bug) of not-releasing 
> or incorrect 
> > work with memory, which caused the problems with 
> > MmUnsecureVirtualMemory.
> > 
> > Index: hw/mlx4/user/hca/buf.c
> > ===================================================================
> > --- hw/mlx4/user/hca/buf.c	(revision 1294)
> > +++ hw/mlx4/user/hca/buf.c	(working copy)
> > @@ -35,17 +35,13 @@
> >  int mlx4_alloc_buf(struct mlx4_buf *buf, int size, int 
> page_size)  {
> >  	int ret;
> > -
> >  	ret = posix_memalign(&buf->buf, page_size, align(size, 
> page_size));
> > -	if (ret)
> > -		return ret;
> > -
> > -	buf->length = size;
> > -
> > -	return 0;
> > +	if (!ret)
> > +		buf->length = size;
> > +	return ret;
> >  }
> >  
> >  void mlx4_free_buf(struct mlx4_buf *buf)  {
> > -	VirtualFree(buf->buf, 0, MEM_RELEASE);
> > +	posix_memfree(buf->buf);
> >  }
> > Index: hw/mlx4/user/hca/l2w.h
> > ===================================================================
> > --- hw/mlx4/user/hca/l2w.h	(revision 1294)
> > +++ hw/mlx4/user/hca/l2w.h	(working copy)
> > @@ -74,17 +74,49 @@
> >  // FUNCTIONS
> >  // ===========================================
> >  
> > +static inline BOOLEAN is_power_of_2(uint32_t n) {
> > +	return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
> > +
> > +// Allocated memory is zeroed !
> >  static inline int posix_memalign(void **memptr, int alignment, int 
> > size)  {
> > -	UNREFERENCED_PARAMETER(alignment);
> > +	int aligned_size, desc_size = sizeof(int);
> > +	char *real_addr, *aligned_addr;
> >  
> > -	*memptr = VirtualAlloc( NULL, size, MEM_COMMIT | 
> > MEM_RESERVE,  PAGE_READWRITE );
> > -	if (*memptr) 
> > -		return 0;
> > -	else	
> > -		return ENOMEM;
> > +	// sanity check: alignment should a power of 2 and more then 2
> > +	if ( alignment < desc_size ||
> > !is_power_of_2((uint32_t)alignment) )
> > +		return -EINVAL;
> > +
> > +	// calculate size, needed for aligned allocation
> > +	aligned_size = size + alignment + desc_size;
> > +
> > +	// allocate
> > +	real_addr = cl_zalloc(aligned_size);
> > +	if ( real_addr == NULL )
> > +		return -ENOMEM;
> > +
> > +	// calculate aligned address
> > +	aligned_addr = (char *)(((ULONG_PTR)(real_addr +
> > alignment-1)) & ~(alignment - 1));
> > +	if ( aligned_addr < real_addr + desc_size )
> > +		aligned_addr += alignment;
> > +
> > +	// store the descriptor
> > +	*(int*)(aligned_addr - desc_size) = (int)(aligned_addr
> > - real_addr);
> > +	
> > +	*memptr = aligned_addr;
> > +	return 0;
> >  }
> >  
> > +// there is no such POSIX function. Called so to be similar
> > to the allocation one.
> > +static inline void posix_memfree(void *memptr) {
> > +	int *desc_addr = (int*)((char*)memptr - sizeof(int));
> > +	char *real_addr = (char*)memptr - *desc_addr;
> > +	cl_free(real_addr);
> > +}
> > +
> >  static inline int ffsl(uint32_t x)
> >  {
> >         int r = 0;
> > Index: hw/mlx4/user/hca/qp.c
> > ===================================================================
> > --- hw/mlx4/user/hca/qp.c	(revision 1294)
> > +++ hw/mlx4/user/hca/qp.c	(working copy)
> > @@ -685,7 +685,6 @@
> >  		return -1;
> >  	}
> >  
> > -	memset(qp->buf.buf, 0, qp->buf_size);
> >  	mlx4_qp_init_sq_ownership(qp);
> >  
> >  	return 0;
> > Index: hw/mlx4/user/hca/srq.c
> > ===================================================================
> > --- hw/mlx4/user/hca/srq.c	(revision 1294)
> > +++ hw/mlx4/user/hca/srq.c	(working copy)
> > @@ -146,8 +146,6 @@
> >  		return -1;
> >  	}
> >  
> > -	// srq->buf.buf is zeroed in posix_memalign - 
> > memset(srq->buf.buf, 0, buf_size);
> > -
> >  	/*
> >  	 * Now initialize the SRQ buffer so that all of the WQEs are
> >  	 * linked into the list of free WQEs.
> > Index: hw/mlx4/user/hca/verbs.c
> > ===================================================================
> > --- hw/mlx4/user/hca/verbs.c	(revision 1294)
> > +++ hw/mlx4/user/hca/verbs.c	(working copy)
> > @@ -373,8 +373,6 @@
> >  						context->page_size))
> >  		goto err_alloc_buf;
> >  
> > -	// cq->buf.buf is zeroed in posix_memalign - 
> > memset(cq->buf.buf, 0, buf_size);
> > -
> >  	cq->ibv_cq.context = context;
> >  	cq->cons_index = 0;
> >  		
> > @@ -718,7 +716,7 @@
> >  	attr.cap.max_recv_wr		= p_create_attr->rq_depth;
> >  	attr.cap.max_send_sge		= p_create_attr->sq_sge;
> >  	attr.cap.max_recv_sge		= p_create_attr->rq_sge;
> > -	attr.cap.max_inline_data	= 
> > p_create_attr->sq_max_inline;		/* absent in IBAL */
> > +	attr.cap.max_inline_data	= p_create_attr->sq_max_inline;
> >  	attr.qp_type				= 
> > __to_qp_type(p_create_attr->qp_type);
> >  	attr.sq_sig_all				= 
> > p_create_attr->sq_signaled;
> >  
> > Index: hw/mthca/user/mlnx_ual_srq.c
> > ===================================================================
> > --- hw/mthca/user/mlnx_ual_srq.c	(revision 1294)
> > +++ hw/mthca/user/mlnx_ual_srq.c	(working copy)
> > @@ -54,11 +54,7 @@
> >  	}
> >  
> >  	if (srq->buf) {
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -		cl_free(srq->buf);
> > -#else
> > -		VirtualFree( srq->buf, 0, MEM_RELEASE);
> > -#endif
> > +		posix_memfree(srq->buf);
> >  	}
> >  
> >  	if (srq->wrid)
> > @@ -158,11 +154,7 @@
> >  	goto end;
> >  
> >  err_alloc_db:
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -		cl_free(srq->buf);
> > -#else
> > -		VirtualFree( srq->buf, 0, MEM_RELEASE);
> > -#endif
> > +	posix_memfree(srq->buf);
> >  	cl_free(srq->wrid);
> >  err_alloc_buf:
> >  	cl_spinlock_destroy(&srq->lock);
> > Index: hw/mthca/user/mlnx_uvp_memfree.c 
> > ===================================================================
> > --- hw/mthca/user/mlnx_uvp_memfree.c	(revision 1294)
> > +++ hw/mthca/user/mlnx_uvp_memfree.c	(working copy)
> > @@ -201,11 +201,7 @@
> >  
> >  	for (i = 0; i < db_tab->npages; ++i)
> >  		if (db_tab->page[i].db_rec)
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -			cl_free(db_tab->page[i].db_rec);
> > -#else
> > -			VirtualFree( db_tab->page[i].db_rec, 0, 
> > MEM_RELEASE);
> > -#endif
> > +			posix_memfree( db_tab->page[i].db_rec);
> >  
> >  	cl_free(db_tab);
> >  }
> > Index: hw/mthca/user/mlnx_uvp_verbs.c
> > ===================================================================
> > --- hw/mthca/user/mlnx_uvp_verbs.c	(revision 1294)
> > +++ hw/mthca/user/mlnx_uvp_verbs.c	(working copy)
> > @@ -80,11 +80,7 @@
> >  		WaitForSingleObject( pd->ah_mutex, INFINITE );
> >  		for (page = pd->ah_list; page; page = next_page) {
> >  			next_page = page->next;
> > -			#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -				cl_free(page->buf);
> > -			#else
> > -				VirtualFree( page->buf, 0, MEM_RELEASE);
> > -			#endif
> > +			posix_memfree(page->buf);
> >  			cl_free(page);
> >  		}
> >  		ReleaseMutex( pd->ah_mutex );
> > @@ -181,7 +177,7 @@
> >  			cq->set_ci_db_index);
> >  
> >  err_unreg:
> > -	cl_free(cq->buf);
> > +	posix_memfree(cq->buf);
> >  
> >  err_memalign:
> >  	cl_spinlock_destroy(&cq->lock);
> > @@ -233,12 +229,7 @@
> >  			      to_mcq(cq)->arm_db_index);
> >  	}
> >  
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -	cl_free(to_mcq(cq)->buf);
> > -#else
> > -	VirtualFree( to_mcq(cq)->buf, 0, MEM_RELEASE);
> > -#endif
> > -
> > +	posix_memfree(to_mcq(cq)->buf);
> >  	
> >  	cl_spinlock_destroy(&((struct mthca_cq *)cq)->lock);
> >  	cl_free(to_mcq(cq));
> > @@ -380,11 +371,7 @@
> >  	
> >  err_spinlock_sq:
> >  	cl_free(qp->wrid);
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -	cl_free(qp->buf);
> > -#else
> > -	VirtualFree( qp->buf, 0, MEM_RELEASE);
> > -#endif
> > +	posix_memfree(qp->buf);
> >  
> >  err_nomem:
> >  	cl_free(qp);
> > @@ -501,11 +488,7 @@
> >  		cl_spinlock_destroy(&((struct mthca_qp *)qp)->sq.lock);
> >  		cl_spinlock_destroy(&((struct mthca_qp *)qp)->rq.lock);
> >  
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -		cl_free(to_mqp(qp)->buf);
> > -#else
> > -		VirtualFree( to_mqp(qp)->buf, 0, MEM_RELEASE);
> > -#endif
> > +		posix_memfree(to_mqp(qp)->buf);
> >  		cl_free(to_mqp(qp)->wrid);
> >  		cl_free(to_mqp(qp));
> >  	}
> > Index: hw/mthca/user/mt_l2w.h
> > ===================================================================
> > --- hw/mthca/user/mt_l2w.h	(revision 1294)
> > +++ hw/mthca/user/mt_l2w.h	(working copy)
> > @@ -52,32 +52,49 @@
> >  
> >  extern size_t g_page_size;
> >  
> > -static inline int posix_memalign(void **memptr, size_t alignment, 
> > size_t size)
> > +static inline BOOLEAN is_power_of_2(uint32_t n)
> >  {
> > -#ifdef NOT_USE_VIRTUAL_ALLOC	
> > -	// sanity checks
> > -	if (alignment % sizeof(void*))
> > -		return EINVAL;
> > -	if (alignment < g_page_size) {
> > -		fprintf(stderr, "mthca: Fatal (posix_memalign): 
> > alignment too small - %d \n",  alignment );
> > -		return EINVAL;
> > -	}
> > +	return (!!n & !(n & (n-1))) ? TRUE : FALSE; }
> >  
> > -	// allocation
> > -	*memptr = cl_malloc(size);
> > -	if (*memptr) 
> > -		return 0;
> > -	else	
> > -		return ENOMEM;
> > -#else
> > -	*memptr = VirtualAlloc( NULL, size, MEM_COMMIT | 
> > MEM_RESERVE,  PAGE_READWRITE );
> > -	if (*memptr) 
> > -		return 0;
> > -	else	
> > -		return ENOMEM;
> > -#endif
> > +// Allocated memory is zeroed !
> > +static inline int posix_memalign(void **memptr, int alignment, int
> > +size) {
> > +	int aligned_size, desc_size = sizeof(int);
> > +	char *real_addr, *aligned_addr;
> > +
> > +	// sanity check: alignment should a power of 2 and more then 2
> > +	if ( alignment < desc_size ||
> > !is_power_of_2((uint32_t)alignment) )
> > +		return -EINVAL;
> > +
> > +	// calculate size, needed for aligned allocation
> > +	aligned_size = size + alignment + desc_size;
> > +
> > +	// allocate
> > +	real_addr = cl_zalloc(aligned_size);
> > +	if ( real_addr == NULL )
> > +		return -ENOMEM;
> > +
> > +	// calculate aligned address
> > +	aligned_addr = (char *)(((ULONG_PTR)(real_addr +
> > alignment-1)) & ~(alignment - 1));
> > +	if ( aligned_addr < real_addr + desc_size )
> > +		aligned_addr += alignment;
> > +
> > +	// store the descriptor
> > +	*(int*)(aligned_addr - desc_size) = (int)(aligned_addr
> > - real_addr);
> > +	
> > +	*memptr = aligned_addr;
> > +	return 0;
> >  }
> >  
> > +// there is no such POSIX function. Called so to be similar
> > to the allocation one.
> > +static inline void posix_memfree(void *memptr) {
> > +	int *desc_addr = (int*)((char*)memptr - sizeof(int));
> > +	char *real_addr = (char*)memptr - *desc_addr;
> > +	cl_free(real_addr);
> > +}
> > +
> >  // ===========================================
> >  // FUNCTIONS
> >  // ===========================================
> > 
> > 
> > 
> > 
> > > -----Original Message-----
> > > From: ofw-bounces at lists.openfabrics.org 
> > > [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of
> > Leonid Keller
> > > Sent: Tuesday, June 24, 2008 1:14 PM
> > > To: Fab Tillier; ofw at lists.openfabrics.org
> > > Subject: [ofw] RE: [HW] memory allocation improvement in 
> user space
> > > 
> > > See inline
> > > 
> > > > -----Original Message-----
> > > > From: Fab Tillier [mailto:ftillier at windows.microsoft.com]
> > > > Sent: Monday, June 23, 2008 8:55 PM
> > > > To: Leonid Keller; ofw at lists.openfabrics.org
> > > > Subject: RE: [HW] memory allocation improvement in user space
> > > > 
> > > > Hi Leo,
> > > > 
> > > > >From: Leonid Keller
> > > > >Sent: Monday, June 23, 2008 6:44 AM
> > > > >
> > > > >Investigation of INSUFFICIENT_MEMORY failures on our
> > stress tests
> > > > >brought us to "revelation", that VirtualAlloc 
> function, used for 
> > > > >implementation of posix_memalign, is a very "greedy" one: it
> > > > allocates
> > > > >at least 64KB memory.
> > > > >As far as we usually ask one page, it is 16 times more than
> > > > necessary.
> > > > 
> > > > VirtualAlloc doesn't actually allocate 64K of memory, 
> but it does 
> > > > reserve 64K of virtual address space.  It then commits a
> > > single page.  
> > > > This article:
> > > > http://msdn.microsoft.com/en-us/library/ms810627.aspx
> > > > explains this behavior: "The minimum size that can be
> > > reserved is 64K
> > > > ... Requesting one page of reserved addresses results in a
> > > 64K address
> > > > range" in the section about reserving memory.
> > > 
> > > I know that. But using reservation and allocation
> > separately requires
> > > a complicate housekeeping system for one and we do not 
> always need 
> > > allocate integral pages for two.
> > > > 
> > > > >Presented below a patch, which implements posix_memalign with
> > > > >(ultimately) HeapAlloc functions.
> > > > >The patch was tested and worked OK.
> > > > >
> > > > >An important nuance, that was revealed during testing is
> > > as follows:
> > > > >A system function, which releases the resources of an
> > > > exiting process,
> > > > >damages in some way the work of MmSecureVirtualMemory
> > > > function, which
> > > > >we use today to secure CQ\QP\SRQ circular buffers and
> > user buffers.
> > > > >If an application gets killed or exits without releasing the
> > > > resources,
> > > > >IBBUS catches this event, starts its cascading destroy of
> > > > resources and
> > > > >crashes on MmUnsecureVirtualMemory.
> > > > >Putting MmUnsecureVirtualMemory in try-except block
> > saves from the
> > > > >crash, but an async thread, releasing QPs, freezes on 
> > > > >MmUnsecureVirtualMemory, which fails to get some mutex.
> > > > 
> > > > Maybe the first exception didn't release the mutex?
> > > 
> > > Maybe. It's not my mutex. It's theirs, internal one.
> > > The only change that I did (in user space) was to allocate
> > memory in
> > > the Heap and not as separate pages.
> > > MmSecureVirtualMemory/MmUnsecureVirtualMemory pair works on
> > the good
> > > flow, when the resources are released.
> > > MmUnsecureVirtualMemory fails with exception only when 
> application 
> > > exits without releasing the resources.
> > > 
> > > > 
> > > > >As far as there is no real reason to secure circular
> > > buffers, i've
> > > > >solved the problem by skipping securing for IB objects.
> > > > >User buffers are still secured while memory registration.
> > > > >
> > > 
> > > BTW, DDK *doesn't* recommend to use this function.
> > > Its intention is to keep a user buffer from freeing or
> > changing access
> > > rights.
> > > We do not need this functionality for circular buffers anyway.
> > > 
> > > > >Therefore the patch contains 3 kinds of changes:
> > > > >1) new implementation of posix_memalign and all 
> related to that;
> > > > >2) try-except block around MmUnsecureVirtualMemory;
> > > > 
> > > > I'd be weary of just working around the issue without
> > understanding
> > > > why it crashes, especially since you found some negative
> > > ramifications
> > > > to this (the subsequent hang).  What if there's some memory
> > > corruption
> > > > problem in the UVP - shouldn't that be fixed rather than hidden?
> > > > 
> > > > >3) new parameter in ib_umem_get and mthca_reg_virt_mr
> > for skipping
> > > > >memory securing and all related to it;
> > > > 
> > > > -Fab
> > > > 
> > > 
> > > 
> > > --
> > > Leonid Keller
> > > Mellanox Technologies LTD.
> > > SW- Windows
> > > Phone: +972 (4) 909 7200 (ext 372)
> > > Mobile: +972 (54) 464 7702
> > > E-mail: leonid at mellanox.co.il
> > > 
> > > 
> > 
> ----------------------------------------------------------------------
> > > Emails belong on computers, trees belong in forests; if you
> > must print
> > > this, do it on recycled paper.
> > > http://www.greenpeace.org/international/
> > > 
> > 
> ----------------------------------------------------------------------
> > > 
> > > 
> > > Disclaimer added by CodeTwo Exchange Rules http://www.codetwo.com 
> > > _______________________________________________
> > > ofw mailing list
> > > ofw at lists.openfabrics.org
> > > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> > > 



More information about the ofw mailing list