[openib-general] Re: [PATCH] sdp: replace mlock with get_user_pages

Libor Michalek libor at topspin.com
Fri May 6 17:54:15 PDT 2005


On Thu, May 05, 2005 at 02:01:58PM +0300, Michael S. Tsirkin wrote:
> Hello, Libor!
> The following patch replaces the mlock hack with a call
> to get_user_pages. Since the application could have forked
> while an iocb is outstanding, when an iocb completes
> I call get_user_pages a second time and copy the data if
> the physical page has changed.
> 
> Thus, changing ulimit is no longer required to get aio
> working, and processes are also allowed to fork and to call
> mlock/munlock on the buffer.
> 
> Tested with the ttcp.aio benchmark; works fine for me.

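[Editor's note: for context, a minimal sketch of the submission-time pinning the patch switches to, using the 2.6-era eight-argument get_user_pages() that the patch itself calls. Everything except the kernel APIs (get_user_pages, put_page, kmalloc, the mmap_sem locking) is illustrative and not part of the SDP code.]

    #include <linux/mm.h>
    #include <linux/sched.h>
    #include <linux/slab.h>
    #include <linux/pagemap.h>

    struct pinned_buf {
    	unsigned long addr;    /* user virtual address */
    	size_t size;           /* buffer length in bytes */
    	int page_count;        /* pages spanned by the buffer */
    	struct page **pages;   /* pinned page references */
    };

    static int pin_user_buf(struct pinned_buf *buf, int write)
    {
    	int got;

    	buf->page_count = PAGE_ALIGN(buf->size + (buf->addr & ~PAGE_MASK))
    			  >> PAGE_SHIFT;
    	buf->pages = kmalloc(sizeof(struct page *) * buf->page_count,
    			     GFP_KERNEL);
    	if (!buf->pages)
    		return -ENOMEM;

    	down_read(&current->mm->mmap_sem);
    	/* take a reference on each page backing the user buffer */
    	got = get_user_pages(current, current->mm, buf->addr & PAGE_MASK,
    			     buf->page_count, write, 0, buf->pages, NULL);
    	up_read(&current->mm->mmap_sem);

    	if (got != buf->page_count) {
    		/* drop whatever we did manage to pin */
    		while (got > 0)
    			put_page(buf->pages[--got]);
    		kfree(buf->pages);
    		buf->pages = NULL;
    		return -EFAULT;
    	}
    	return 0;
    }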
  In the latest kernel, what happens on fork to a page that is
held with a get_user_pages reference? That is the only case
left. Since there is an iocb flags parameter, it would be
preferred to use a bit there instead of is_receive. Also,
please leave spaces around '='. Finally, it would be nice
if we could get rid of having to schedule a task to unlock
the iocb on completion; I've noticed this having a negative
effect on system CPU utilization on older kernels.

-Libor

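[Editor's note: a rough sketch of the completion-time re-check Michael describes above: re-pin the page and copy the received data if the backing page changed, which is the fork/COW case under discussion. The helper name and arguments are illustrative; the real code is sdp_iocb_unlock() in the patch below.]

    #include <linux/mm.h>
    #include <linux/sched.h>
    #include <linux/highmem.h>
    #include <linux/string.h>
    #include <linux/pagemap.h>

    static void recheck_one_page(struct task_struct *tsk, struct mm_struct *mm,
    			     struct page *pinned, unsigned long uaddr,
    			     size_t bytes)   /* bytes must not cross the page end */
    {
    	struct page *now;
    	int got;

    	down_read(&mm->mmap_sem);
    	got = get_user_pages(tsk, mm, uaddr & PAGE_MASK, 1, 1, 0, &now, NULL);
    	up_read(&mm->mmap_sem);
    	if (got != 1)
    		goto out;

    	if (now != pinned) {
    		/* page was remapped since submission (e.g. COW after fork):
    		 * copy the DMA'd data into the page the task now sees */
    		unsigned long off = uaddr & ~PAGE_MASK;
    		void *from = kmap(pinned);
    		void *to   = kmap(now);

    		memcpy(to + off, from + off, bytes);
    		kunmap(now);
    		kunmap(pinned);
    		set_page_dirty_lock(now);
    	}
    	put_page(now);
    out:
    	put_page(pinned);   /* drop the reference taken at submission */
    }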
> Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
> 
> Index: ulp/sdp/sdp_send.c
> ===================================================================
> --- ulp/sdp/sdp_send.c	(revision 2235)
> +++ ulp/sdp/sdp_send.c	(working copy)
> @@ -2197,6 +2202,7 @@ skip: /* entry point for IOCB based tran
>  		iocb->req  = req;
>  		iocb->key  = req->ki_key;
>  		iocb->addr = (unsigned long)msg->msg_iov->iov_base - copied;
> +		iocb->is_receive = 0;
>        
>  		req->ki_cancel = sdp_inet_write_cancel;
>  
> Index: ulp/sdp/sdp_recv.c
> ===================================================================
> --- ulp/sdp/sdp_recv.c	(revision 2235)
> +++ ulp/sdp/sdp_recv.c	(working copy)
> @@ -1447,6 +1448,7 @@ int sdp_inet_recv(struct kiocb  *req, st
>  			iocb->key  = req->ki_key;
>  			iocb->addr = ((unsigned long)msg->msg_iov->iov_base -
>  				      copied);
> +			iocb->is_receive = 1;
>  
>  			req->ki_cancel = sdp_inet_read_cancel;
>  
> Index: ulp/sdp/sdp_iocb.c
> ===================================================================
> --- ulp/sdp/sdp_iocb.c	(revision 2235)
> +++ ulp/sdp/sdp_iocb.c	(working copy)
> @@ -1,5 +1,6 @@
>  /*
>   * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
>   *
>   * This software is available to you under a choice of one of two
>   * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -31,389 +32,197 @@
>   *
>   * $Id$
>   */
> -
> +#include <linux/pagemap.h>
>  #include "sdp_main.h"
>  
>  static kmem_cache_t *sdp_iocb_cache = NULL;
>  
> -/*
> - * memory locking functions
> - */
> -#include <linux/utsname.h>
> -
> -typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
> -static do_mlock_ptr_t mlock_ptr = NULL;
> -
> -/*
> - * do_iocb_unlock - unlock the memory for an IOCB
> - */
> -static int do_iocb_unlock(struct sdpc_iocb *iocb)
> -{
> -	struct vm_area_struct *vma;
> -
> -	vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
> -	if (!vma)
> -		sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
> -			 iocb->addr, iocb->size);
> -
> -	while (vma) {
> -		sdp_dbg_data(NULL, 
> -			     "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
> -			     iocb->addr, vma, vma->vm_start, vma->vm_end,
> -			     vma->vm_flags, (long)vma->vm_private_data);
> -		
> -		spin_lock(&iocb->mm->page_table_lock);
> -		/*
> -		 * if there are no more references to the vma
> -		 */
> -		vma->vm_private_data--;
> - 
> -		if (!vma->vm_private_data) {
> -			/*
> -			 * modify VM flags.
> -			 */
> -			vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
> -			/*
> -			 * adjust locked page count
> -			 */
> -			vma->vm_mm->locked_vm -= ((vma->vm_end -
> -						   vma->vm_start) >> 
> -						  PAGE_SHIFT);
> -		}
> -
> -		spin_unlock(&iocb->mm->page_table_lock);
> -		/*
> -		 * continue if the buffer continues onto the next vma
> -		 */
> -		if ((iocb->addr + iocb->size) > vma->vm_end)
> -			vma = vma->vm_next;
> -		else
> -			vma = NULL;
> -	}
> -
> -	return 0;
> +static void sdp_copy_one_page(struct page *from, struct page* to, 
> +		      unsigned long iocb_addr, size_t iocb_size,
> +		      unsigned long uaddr)
> +{
> +	size_t size_left = iocb_addr + iocb_size - uaddr;
> +	size_t size = min(size_left,PAGE_SIZE);
> +	unsigned long offset = uaddr % PAGE_SIZE;
> +	unsigned long flags;
> +
> +	void* fptr;
> +	void* tptr;
> +
> +	local_irq_save(flags);
> +	fptr = kmap_atomic(from, KM_IRQ0);
> +	tptr = kmap_atomic(to, KM_IRQ1);
> +
> +	memcpy(tptr + offset, fptr + offset, size);
> +
> +	kunmap_atomic(tptr, KM_IRQ1);
> +	kunmap_atomic(fptr, KM_IRQ0);
> +	local_irq_restore(flags);
> +	set_page_dirty_lock(to);
>  }
>  
>  /*
>   * sdp_iocb_unlock - unlock the memory for an IOCB
> + * Copy if pages moved since.
> + * TODO: is this needed?
>   */
>  int sdp_iocb_unlock(struct sdpc_iocb *iocb)
>  {
>  	int result;
> +	struct page ** pages = NULL;
> +	unsigned long uaddr;
> +	int i;
> +
>  
> -	/*
> -	 * check if IOCB is locked.
> -	 */
>  	if (!(iocb->flags & SDP_IOCB_F_LOCKED))
>  		return 0;
> -	/*
> -	 * spin lock since this could be from interrupt context.
> -	 */
> -	down_write(&iocb->mm->mmap_sem);
> -	
> -	result = do_iocb_unlock(iocb);
>  
> -	up_write(&iocb->mm->mmap_sem);
> +	/* For read, unlock and we are done */
> +	if (!iocb->is_receive) {
> +		for (i = 0;i < iocb->page_count; ++i)
> +			put_page(iocb->page_array[i]);
> +		goto done;
> +	}
>  
> -	kfree(iocb->page_array);
> -	kfree(iocb->addr_array);
> +	/* For write, we must check the virtual pages did not get remapped */
>  
> -	iocb->page_array = NULL;
> -	iocb->addr_array = NULL;
> -	iocb->mm = NULL;
> -	/*
> -	 * mark IOCB unlocked.
> -	 */
> -	iocb->flags &= ~SDP_IOCB_F_LOCKED;
> +	/* As an optimisation (to avoid scanning the vma tree each time),
> +	 * try to get all pages in one go. */
> +	/* TODO: use cache for allocations? Allocate by chunks? */
>  
> -	return result;
> -}
> +	pages = kmalloc((sizeof(struct page *) * iocb->page_count), GFP_KERNEL);
>  
> -/*
> - * sdp_iocb_page_save - save page information for an IOCB
> - */
> -static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
> -{
> -	unsigned int counter;
> -	unsigned long addr;
> -	size_t size;
> -	int result = -ENOMEM;
> -	struct page *page;
> -	unsigned long pfn;
> -	pgd_t *pgd;
> -	pud_t *pud;
> -	pmd_t *pmd;
> -	pte_t *ptep;
> -	pte_t  pte;
> +	down_read(&iocb->mm->mmap_sem);
>  
> -	if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
> -		return -EINVAL;
> -	/*
> -	 * create array to hold page value which are later needed to register
> -	 * the buffer with the HCA
> -	 */
> -	iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
> -				   GFP_KERNEL);
> -	if (!iocb->addr_array)
> -		goto err_addr;
> +	if (pages) {
> +		result=get_user_pages(iocb->tsk, iocb->mm,
> +				      iocb->addr,
> +				      iocb->page_count , iocb->is_receive, 0,
> +				      pages, NULL);
>  
> -	iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count), 
> -				   GFP_KERNEL);
> -	if (!iocb->page_array)
> -		goto err_page;
> -	/*
> -	 * iocb->addr - buffer start address
> -	 * iocb->size - buffer length
> -	 * addr       - page aligned
> -	 * size       - page multiple
> -	 */
> -	addr = iocb->addr & PAGE_MASK;
> -	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
> +		if (result != iocb->page_count) {
> +			kfree(pages);
> +			pages = NULL;
> +		}
> +	}
>  
> -	iocb->page_offset = iocb->addr - addr;
> -	/*
> -	 * Find pages used within the buffer which will then be registered
> -	 * for RDMA
> -	 */
> -	spin_lock(&iocb->mm->page_table_lock);
> +	for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
> +	     ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE)
> +	{
> +		struct page* page;
> +		set_page_dirty_lock(iocb->page_array[i]);
> +
> +		if (pages)
> +			page = pages[i];
> +		else {
> +			result=get_user_pages(iocb->tsk, iocb->mm,
> +					      uaddr & PAGE_MASK,
> +					      1 , 1, 0, &page, NULL);
> +			if (result != 1) {
> +				page = NULL;
> +			}
> +		}
>  
> -	for (counter = 0;
> -	     size > 0;
> -	     counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
> -		pgd = pgd_offset_gate(iocb->mm, addr);
> -		if (!pgd || pgd_none(*pgd))
> -			break;
> -
> -		pud = pud_offset(pgd, addr);
> -		if (!pud || pud_none(*pud))
> -			break;
> -		
> -		pmd = pmd_offset(pud, addr);
> -		if (!pmd || pmd_none(*pmd))
> -			break;
> -
> -		ptep = pte_offset_map(pmd, addr);
> -		if (!ptep) 
> -			break;
> -
> -		pte = *ptep;
> -		pte_unmap(ptep);
> -
> -		if (!pte_present(pte))
> -			break;
> -
> -		pfn = pte_pfn(pte);
> -		if (!pfn_valid(pfn))
> -			break;
> -        
> -		page = pfn_to_page(pfn);
> +		if (page && iocb->page_array[i] != page)
> +			sdp_copy_one_page(iocb->page_array[i], page,
> +					  iocb->addr, iocb->size, uaddr);
>  
> -		iocb->page_array[counter] = page;
> -		iocb->addr_array[counter] = page_to_phys(page);
> +		if (page)
> +			put_page(page);
> +		put_page(iocb->page_array[i]);
>  	}
>  
> -	spin_unlock(&iocb->mm->page_table_lock);
> -	
> -	if (size > 0) {
> -		result = -EFAULT;
> -		goto err_find;
> -	}
> +	up_read(&iocb->mm->mmap_sem);
>  
> -	return 0;
> -err_find:
> -	
> -	kfree(iocb->page_array);
> -	iocb->page_array = NULL;
> -err_page:
> +	if (pages)
> +		kfree(pages);
>  
> +done:
> +	kfree(iocb->page_array);
>  	kfree(iocb->addr_array);
> +
> +	iocb->page_array = NULL;
>  	iocb->addr_array = NULL;
> -err_addr:
> +	iocb->mm = NULL;
> +	iocb->tsk = NULL;
>  
> -	return result;
> +	iocb->flags &= ~SDP_IOCB_F_LOCKED;
> +
> +	return 0;
>  }
>  
>  /*
>   * sdp_iocb_lock - lock the memory for an IOCB
> + * We do not take a reference on the mm, AIO handles this for us.
>   */
>  int sdp_iocb_lock(struct sdpc_iocb *iocb)
>  {
> -	struct vm_area_struct *vma;
> -	kernel_cap_t real_cap;
> -	unsigned long limit;
>  	int result = -ENOMEM;
>  	unsigned long addr;
> -	size_t        size;
> -	
> -	/*
> -	 * mark IOCB as locked. We do not take a reference on the mm, AIO
> -	 * handles this for us.
> -	 */
> -	iocb->flags |= SDP_IOCB_F_LOCKED;
> -	iocb->mm     = current->mm;
> +	size_t size;
> +	int i;
>  	/*
> -	 * save and raise capabilities
> +	 * iocb->addr - buffer start address
> +	 * iocb->size - buffer length
> +	 * addr       - page aligned
> +	 * size       - page multiple
>  	 */
> -	real_cap = cap_t(current->cap_effective);
> -	cap_raise(current->cap_effective, CAP_IPC_LOCK);
> -  
> -	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
>  	addr = iocb->addr & PAGE_MASK;
> +	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
>  
> +	iocb->page_offset = iocb->addr - addr;
> +	
>  	iocb->page_count = size >> PAGE_SHIFT;
> -
> -	limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
> -	limit >>= PAGE_SHIFT;
> -	/*
> -	 * lock the mm, if within the limit lock the address range.
> -	 */
> -	down_write(&iocb->mm->mmap_sem);
> -
> -	if (!((iocb->page_count + current->mm->locked_vm) > limit))
> -		result = (*mlock_ptr)(addr, size, 1);
>  	/*
> -	 * process result
> -	 */
> -	if (result) {
> -		sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
> -			    iocb->addr, iocb->size, result, 
> -			    iocb->page_count, iocb->mm->locked_vm, limit);
> -		goto err_lock;
> -	}
> -	/*
> -	 * look up the head of the vma queue, loop through the vmas, marking
> -	 * them do not copy, reference counting, and saving them. 
> +	 * create array to hold page value which are later needed to register
> +	 * the buffer with the HCA
>  	 */
> -	vma = find_vma(iocb->mm, addr);
> -	if (!vma)
> -		/*
> -		 * sanity check.
> -		 */
> -		sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
> -			 iocb->addr, iocb->size);
>  
> -	while (vma) {
> -		spin_lock(&iocb->mm->page_table_lock);
> +	/* TODO: use cache for allocations? Allocate by chunks? */
> +	iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
> +				   GFP_KERNEL);
> +	if (!iocb->addr_array)
> +		goto err_addr;
>  
> -		if (!(VM_LOCKED & vma->vm_flags))
> -			sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
> +	iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count), 
> +				   GFP_KERNEL);
> +	if (!iocb->page_array)
> +		goto err_page;
>  
> -		if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
> -			sdp_dbg_err("VMA: private daya in use! <%08lx>",
> -				    (unsigned long)vma->vm_private_data);
> -		
> -		vma->vm_flags |= VM_DONTCOPY;
> -		vma->vm_private_data++;
> +	down_read(&current->mm->mmap_sem);
>  
> -		spin_unlock(&iocb->mm->page_table_lock);
> +        result=get_user_pages(current, current->mm, iocb->addr,
> +			      iocb->page_count , iocb->is_receive, 0,
> +			      iocb->page_array, NULL);
> +
> +	up_read(&current->mm->mmap_sem);
> +
> +	if (result != iocb->page_count) {
> +		sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
> +			    iocb->addr, iocb->size, result, iocb->page_count);
> +		goto err_get;
> +	}
>  
> -		sdp_dbg_data(NULL, 
> -			     "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
> -			     iocb->addr, vma, vma->vm_start, vma->vm_end,
> -			     vma->vm_flags, (long)vma->vm_private_data);
> +	iocb->flags |= SDP_IOCB_F_LOCKED;
> +	iocb->mm     = current->mm;
> +	iocb->tsk    = current;
>  
> -		if ((addr + size) > vma->vm_end)
> -			vma = vma->vm_next;
> -		else
> -			vma = NULL;
> -	}
>  
> -	result = sdp_iocb_page_save(iocb);
> -	if (result) {
> -		sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>", 
> -			    result, iocb->addr, iocb->size);
> -		goto err_save;
> +	for (i = 0; i< iocb->page_count; ++i) {
> +		iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
>  	}
>  
> -	up_write(&iocb->mm->mmap_sem);
> -	cap_t(current->cap_effective) = real_cap;
> -
>  	return 0;
> -err_save:
> -
> -	(void)do_iocb_unlock(iocb);
> -err_lock:
> -	/*
> -	 * unlock the mm and restore capabilities.
> -	 */
> -	up_write(&iocb->mm->mmap_sem);
> -	cap_t(current->cap_effective) = real_cap;
> -
> -	iocb->flags &= ~SDP_IOCB_F_LOCKED;
> -	iocb->mm = NULL;
>  
> +err_get:
> +	kfree(iocb->page_array);
> +err_page:
> +	kfree(iocb->addr_array);
> +err_addr:
>  	return result;
>  }
>  
>  /*
> - * IOCB memory locking init functions
> - */
> -struct kallsym_iter {
> -	loff_t pos;
> -	struct module *owner;
> -	unsigned long value;
> -	unsigned int nameoff; /* If iterating in core kernel symbols */
> -	char type;
> -	char name[128];
> -};
> -
> -/*
> - * sdp_mem_lock_init - initialize the userspace memory locking
> - */
> -static int sdp_mem_lock_init(void)
> -{
> -	struct file *kallsyms;
> -	struct seq_file *seq;
> -	struct kallsym_iter *iter;
> -	loff_t pos = 0;
> -	int ret = -EINVAL;
> -	
> -	sdp_dbg_init("Memory Locking initialization.");
> -	
> -	kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
> -	if (!kallsyms) {
> -		sdp_warn("Failed to open /proc/kallsyms");
> -		goto done;
> -	}
> -
> -	seq = (struct seq_file *)kallsyms->private_data;
> -	if (!seq) {
> -		sdp_warn("Failed to fetch sequential file.");
> -		goto err_close;
> -	}
> -
> -	for (iter = seq->op->start(seq, &pos);
> -	     iter != NULL;
> -	     iter = seq->op->next(seq, iter, &pos))
> -		if (!strcmp(iter->name, "do_mlock"))
> -			mlock_ptr = (do_mlock_ptr_t)iter->value;
> -
> -	if (!mlock_ptr)
> -		sdp_warn("Failed to find lock pointer.");
> -	else
> -		ret = 0;
> -
> -err_close:
> -	filp_close(kallsyms, NULL);
> -done:
> -	return ret;
> -}
> -
> -/*
> - * sdp_mem_lock_cleanup - cleanup the memory locking tables
> - */
> -static int sdp_mem_lock_cleanup(void)
> -{
> -	sdp_dbg_init("Memory Locking cleanup.");
> -	/*
> -	 * null out entries.
> -	 */
> -	mlock_ptr = NULL;
> -	
> -	return 0;
> -}
> -
> -/*
>   * IOCB memory registration functions
>   */
>  
> @@ -831,28 +640,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
>  }
>  
>  /*
> - * primary initialization/cleanup functions
> - */
> -
> -/*
>   * sdp_main_iocb_init - initialize the advertisment caches
>   */
>  int sdp_main_iocb_init(void)
>  {
> -	int result;
> -
>  	sdp_dbg_init("IOCB cache initialization.");
> -	/*
> -	 * initialize locking code.
> -	 */
> -	result =  sdp_mem_lock_init();
> -	if (result < 0) {
> -		sdp_warn("Error <%d> initializing memory locking.", result);
> -		return result;
> -	}
> -	/*
> -	 * initialize the caches only once.
> -	 */
> +
>  	if (sdp_iocb_cache) {
>  		sdp_warn("IOCB caches already initialized.");
>  		return -EINVAL;
> @@ -862,15 +655,10 @@ int sdp_main_iocb_init(void)
>  					     sizeof(struct sdpc_iocb),
>  					     0, SLAB_HWCACHE_ALIGN, NULL,
>  					     NULL);
> -	if (!sdp_iocb_cache) {
> -		result = -ENOMEM;
> -		goto error_iocb_c;
> -	}
> +	if (!sdp_iocb_cache)
> +		return -ENOMEM;
>  
>  	return 0;
> -error_iocb_c:
> -	(void)sdp_mem_lock_cleanup();
> -	return result;
>  }
>  
>  /*
> @@ -879,16 +667,6 @@ error_iocb_c:
>  void sdp_main_iocb_cleanup(void)
>  {
>  	sdp_dbg_init("IOCB cache cleanup.");
> -	/*
> -	 * cleanup the caches
> -	 */
>  	kmem_cache_destroy(sdp_iocb_cache);
> -	/*
> -	 * null out entries.
> -	 */
>  	sdp_iocb_cache = NULL;
> -	/*
> -	 * cleanup memory locking
> -	 */
> -	(void)sdp_mem_lock_cleanup();
>  }
> Index: ulp/sdp/sdp_iocb.h
> ===================================================================
> --- ulp/sdp/sdp_iocb.h	(revision 2235)
> +++ ulp/sdp/sdp_iocb.h	(working copy)
> @@ -99,9 +99,11 @@ struct sdpc_iocb {
>  	/*
>  	 * page list. data for locking/registering userspace
>  	 */
> -	struct mm_struct *mm;      /* user mm struct */
> -	unsigned long     addr;    /* user space address */
> -	size_t            size;    /* total size of the user buffer */
> +	struct mm_struct   *mm;      /* user mm struct */
> +	struct task_struct *tsk;
> +	unsigned long       addr;    /* user space address */
> +	size_t              size;    /* total size of the user buffer */
> +	int		    is_receive;
>  
>  	struct page **page_array;  /* list of page structure pointers. */
>  	u64          *addr_array;  /* list of physical page addresses. */
> 
> 
> -- 
> MST - Michael S. Tsirkin


