[openib-general] [PATCH] sdp: replace mlock with get_user_pages

Michael S. Tsirkin mst at mellanox.co.il
Thu May 5 04:01:58 PDT 2005


Hello, Libor!
The following patch replaces the mlock hack with a call
to get_user_pages. Since the application could have forked
while an iocb is outstanding, when an iocb completes
I call get_user_pages a second time and copy the data if
the physical pages have changed.
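
In other words, the submit-time side boils down to something like the
following (a hypothetical sketch of 2.6 get_user_pages usage, not the
patch code itself; pin_user_buffer is a made-up name):

#include <linux/mm.h>
#include <linux/sched.h>

static int pin_user_buffer(unsigned long uaddr, int npages, int write,
			   struct page **pages)
{
	int got;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
			     npages, write, 0 /* no force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (got != npages) {
		/* release whatever was pinned and fail */
		while (got > 0)
			put_page(pages[--got]);
		return -EFAULT;
	}
	return 0;
}

For a receive, "write" must be 1, since the HCA stores into the pages.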

Thus, changing ulimit is no longer required to get aio
working, and processes are also free to fork and to call mlock/munlock
on the buffer.

Tested with the ttcp.aio benchmark; works fine for me.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: ulp/sdp/sdp_send.c
===================================================================
--- ulp/sdp/sdp_send.c	(revision 2235)
+++ ulp/sdp/sdp_send.c	(working copy)
@@ -2197,6 +2202,7 @@ skip: /* entry point for IOCB based tran
 		iocb->req  = req;
 		iocb->key  = req->ki_key;
 		iocb->addr = (unsigned long)msg->msg_iov->iov_base - copied;
+		iocb->is_receive = 0;
       
 		req->ki_cancel = sdp_inet_write_cancel;
 
Index: ulp/sdp/sdp_recv.c
===================================================================
--- ulp/sdp/sdp_recv.c	(revision 2235)
+++ ulp/sdp/sdp_recv.c	(working copy)
@@ -1447,6 +1448,7 @@ int sdp_inet_recv(struct kiocb  *req, st
 			iocb->key  = req->ki_key;
 			iocb->addr = ((unsigned long)msg->msg_iov->iov_base -
 				      copied);
+			iocb->is_receive = 1;
 
 			req->ki_cancel = sdp_inet_read_cancel;
 
Index: ulp/sdp/sdp_iocb.c
===================================================================
--- ulp/sdp/sdp_iocb.c	(revision 2235)
+++ ulp/sdp/sdp_iocb.c	(working copy)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,389 +32,204 @@
  *
  * $Id$
  */
-
+#include <linux/pagemap.h>
 #include "sdp_main.h"
 
 static kmem_cache_t *sdp_iocb_cache = NULL;
 
-/*
- * memory locking functions
- */
-#include <linux/utsname.h>
-
-typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
-static do_mlock_ptr_t mlock_ptr = NULL;
-
-/*
- * do_iocb_unlock - unlock the memory for an IOCB
- */
-static int do_iocb_unlock(struct sdpc_iocb *iocb)
-{
-	struct vm_area_struct *vma;
-
-	vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
-	if (!vma)
-		sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
-			 iocb->addr, iocb->size);
-
-	while (vma) {
-		sdp_dbg_data(NULL, 
-			     "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
-			     iocb->addr, vma, vma->vm_start, vma->vm_end,
-			     vma->vm_flags, (long)vma->vm_private_data);
-		
-		spin_lock(&iocb->mm->page_table_lock);
-		/*
-		 * if there are no more references to the vma
-		 */
-		vma->vm_private_data--;
- 
-		if (!vma->vm_private_data) {
-			/*
-			 * modify VM flags.
-			 */
-			vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
-			/*
-			 * adjust locked page count
-			 */
-			vma->vm_mm->locked_vm -= ((vma->vm_end -
-						   vma->vm_start) >> 
-						  PAGE_SHIFT);
-		}
-
-		spin_unlock(&iocb->mm->page_table_lock);
-		/*
-		 * continue if the buffer continues onto the next vma
-		 */
-		if ((iocb->addr + iocb->size) > vma->vm_end)
-			vma = vma->vm_next;
-		else
-			vma = NULL;
-	}
-
-	return 0;
+static void sdp_copy_one_page(struct page *from, struct page *to,
+		      unsigned long iocb_addr, size_t iocb_size,
+		      unsigned long uaddr)
+{
+	unsigned long offset = uaddr % PAGE_SIZE;
+	size_t size_left = iocb_addr + iocb_size - uaddr;
+	/* do not copy past the end of the page when uaddr is unaligned */
+	size_t size = min_t(size_t, size_left, PAGE_SIZE - offset);
+	unsigned long flags;
+
+	void *fptr;
+	void *tptr;
+
+	local_irq_save(flags);
+	fptr = kmap_atomic(from, KM_IRQ0);
+	tptr = kmap_atomic(to, KM_IRQ1);
+
+	memcpy(tptr + offset, fptr + offset, size);
+
+	kunmap_atomic(tptr, KM_IRQ1);
+	kunmap_atomic(fptr, KM_IRQ0);
+	local_irq_restore(flags);
+	set_page_dirty_lock(to);
 }
 
 /*
  * sdp_iocb_unlock - unlock the memory for an IOCB
+ * Copy the data back if the pages were remapped in the meantime
+ * (e.g. by COW after a fork).
+ * TODO: is this needed?
  */
 int sdp_iocb_unlock(struct sdpc_iocb *iocb)
 {
 	int result;
+	struct page **pages = NULL;
+	unsigned long uaddr;
+	int i;
+
 
-	/*
-	 * check if IOCB is locked.
-	 */
 	if (!(iocb->flags & SDP_IOCB_F_LOCKED))
 		return 0;
-	/*
-	 * spin lock since this could be from interrupt context.
-	 */
-	down_write(&iocb->mm->mmap_sem);
-	
-	result = do_iocb_unlock(iocb);
 
-	up_write(&iocb->mm->mmap_sem);
+	/* For a send the pages were only read from; just release them */
+	if (!iocb->is_receive) {
+		for (i = 0; i < iocb->page_count; ++i)
+			put_page(iocb->page_array[i]);
+		goto done;
+	}
 
-	kfree(iocb->page_array);
-	kfree(iocb->addr_array);
+	/* For a receive we must check that the pages did not get remapped */
 
-	iocb->page_array = NULL;
-	iocb->addr_array = NULL;
-	iocb->mm = NULL;
-	/*
-	 * mark IOCB unlocked.
-	 */
-	iocb->flags &= ~SDP_IOCB_F_LOCKED;
+	/* As an optimisation (to avoid scanning the vma tree each time),
+	 * try to get all pages in one go. */
+	/* TODO: use cache for allocations? Allocate by chunks? */
 
-	return result;
-}
+	pages = kmalloc((sizeof(struct page *) * iocb->page_count), GFP_KERNEL);
 
-/*
- * sdp_iocb_page_save - save page information for an IOCB
- */
-static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
-{
-	unsigned int counter;
-	unsigned long addr;
-	size_t size;
-	int result = -ENOMEM;
-	struct page *page;
-	unsigned long pfn;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep;
-	pte_t  pte;
+	down_read(&iocb->mm->mmap_sem);
 
-	if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
-		return -EINVAL;
-	/*
-	 * create array to hold page value which are later needed to register
-	 * the buffer with the HCA
-	 */
-	iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
-				   GFP_KERNEL);
-	if (!iocb->addr_array)
-		goto err_addr;
+	if (pages) {
+		result = get_user_pages(iocb->tsk, iocb->mm,
+					iocb->addr,
+					iocb->page_count, iocb->is_receive, 0,
+					pages, NULL);
 
-	iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count), 
-				   GFP_KERNEL);
-	if (!iocb->page_array)
-		goto err_page;
-	/*
-	 * iocb->addr - buffer start address
-	 * iocb->size - buffer length
-	 * addr       - page aligned
-	 * size       - page multiple
-	 */
-	addr = iocb->addr & PAGE_MASK;
-	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
+		if (result != iocb->page_count) {
+			kfree(pages);
+			pages = NULL;
+		}
+	}
 
-	iocb->page_offset = iocb->addr - addr;
-	/*
-	 * Find pages used within the buffer which will then be registered
-	 * for RDMA
-	 */
-	spin_lock(&iocb->mm->page_table_lock);
+	for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
+	     ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE)
+	{
+		struct page *page;
+		set_page_dirty_lock(iocb->page_array[i]);
+
+		if (pages)
+			page = pages[i];
+		else {
+			result = get_user_pages(iocb->tsk, iocb->mm,
+						uaddr & PAGE_MASK,
+						1, 1, 0, &page, NULL);
+			if (result != 1) {
+				page = NULL;
+			}
+		}
 
-	for (counter = 0;
-	     size > 0;
-	     counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
-		pgd = pgd_offset_gate(iocb->mm, addr);
-		if (!pgd || pgd_none(*pgd))
-			break;
-
-		pud = pud_offset(pgd, addr);
-		if (!pud || pud_none(*pud))
-			break;
-		
-		pmd = pmd_offset(pud, addr);
-		if (!pmd || pmd_none(*pmd))
-			break;
-
-		ptep = pte_offset_map(pmd, addr);
-		if (!ptep) 
-			break;
-
-		pte = *ptep;
-		pte_unmap(ptep);
-
-		if (!pte_present(pte))
-			break;
-
-		pfn = pte_pfn(pte);
-		if (!pfn_valid(pfn))
-			break;
-        
-		page = pfn_to_page(pfn);
+		if (page && iocb->page_array[i] != page)
+			sdp_copy_one_page(iocb->page_array[i], page,
+					  iocb->addr, iocb->size, uaddr);
 
-		iocb->page_array[counter] = page;
-		iocb->addr_array[counter] = page_to_phys(page);
+		if (page)
+			put_page(page);
+		put_page(iocb->page_array[i]);
 	}
 
-	spin_unlock(&iocb->mm->page_table_lock);
-	
-	if (size > 0) {
-		result = -EFAULT;
-		goto err_find;
-	}
+	up_read(&iocb->mm->mmap_sem);
 
-	return 0;
-err_find:
-	
-	kfree(iocb->page_array);
-	iocb->page_array = NULL;
-err_page:
+	if (pages)
+		kfree(pages);
 
+done:
+	kfree(iocb->page_array);
 	kfree(iocb->addr_array);
+
+	iocb->page_array = NULL;
 	iocb->addr_array = NULL;
-err_addr:
+	iocb->mm = NULL;
+	iocb->tsk = NULL;
 
-	return result;
+	iocb->flags &= ~SDP_IOCB_F_LOCKED;
+
+	return 0;
 }
 
 /*
  * sdp_iocb_lock - lock the memory for an IOCB
+ * We do not take a reference on the mm; AIO handles this for us.
  */
 int sdp_iocb_lock(struct sdpc_iocb *iocb)
 {
-	struct vm_area_struct *vma;
-	kernel_cap_t real_cap;
-	unsigned long limit;
 	int result = -ENOMEM;
 	unsigned long addr;
-	size_t        size;
-	
-	/*
-	 * mark IOCB as locked. We do not take a reference on the mm, AIO
-	 * handles this for us.
-	 */
-	iocb->flags |= SDP_IOCB_F_LOCKED;
-	iocb->mm     = current->mm;
+	size_t size;
+	int i;
 	/*
-	 * save and raise capabilities
+	 * iocb->addr - buffer start address
+	 * iocb->size - buffer length
+	 * addr       - page aligned
+	 * size       - page multiple
 	 */
-	real_cap = cap_t(current->cap_effective);
-	cap_raise(current->cap_effective, CAP_IPC_LOCK);
-  
-	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
 	addr = iocb->addr & PAGE_MASK;
+	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
 
+	iocb->page_offset = iocb->addr - addr;
+	
 	iocb->page_count = size >> PAGE_SHIFT;
-
-	limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-	limit >>= PAGE_SHIFT;
-	/*
-	 * lock the mm, if within the limit lock the address range.
-	 */
-	down_write(&iocb->mm->mmap_sem);
-
-	if (!((iocb->page_count + current->mm->locked_vm) > limit))
-		result = (*mlock_ptr)(addr, size, 1);
 	/*
-	 * process result
-	 */
-	if (result) {
-		sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
-			    iocb->addr, iocb->size, result, 
-			    iocb->page_count, iocb->mm->locked_vm, limit);
-		goto err_lock;
-	}
-	/*
-	 * look up the head of the vma queue, loop through the vmas, marking
-	 * them do not copy, reference counting, and saving them. 
+	 * create array to hold the page values, which are later needed to
+	 * register the buffer with the HCA
 	 */
-	vma = find_vma(iocb->mm, addr);
-	if (!vma)
-		/*
-		 * sanity check.
-		 */
-		sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
-			 iocb->addr, iocb->size);
 
-	while (vma) {
-		spin_lock(&iocb->mm->page_table_lock);
+	/* TODO: use cache for allocations? Allocate by chunks? */
+	iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
+				   GFP_KERNEL);
+	if (!iocb->addr_array)
+		goto err_addr;
 
-		if (!(VM_LOCKED & vma->vm_flags))
-			sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
+	iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count), 
+				   GFP_KERNEL);
+	if (!iocb->page_array)
+		goto err_page;
 
-		if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
-			sdp_dbg_err("VMA: private daya in use! <%08lx>",
-				    (unsigned long)vma->vm_private_data);
-		
-		vma->vm_flags |= VM_DONTCOPY;
-		vma->vm_private_data++;
+	down_read(&current->mm->mmap_sem);
 
-		spin_unlock(&iocb->mm->page_table_lock);
+	result = get_user_pages(current, current->mm, iocb->addr,
+				iocb->page_count, iocb->is_receive, 0,
+				iocb->page_array, NULL);
+
+	up_read(&current->mm->mmap_sem);
+
+	if (result != iocb->page_count) {
+		sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
+			    iocb->addr, iocb->size, result, iocb->page_count);
+		goto err_get;
+	}
 
-		sdp_dbg_data(NULL, 
-			     "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
-			     iocb->addr, vma, vma->vm_start, vma->vm_end,
-			     vma->vm_flags, (long)vma->vm_private_data);
+	iocb->flags |= SDP_IOCB_F_LOCKED;
+	iocb->mm     = current->mm;
+	iocb->tsk    = current;
 
-		if ((addr + size) > vma->vm_end)
-			vma = vma->vm_next;
-		else
-			vma = NULL;
-	}
 
-	result = sdp_iocb_page_save(iocb);
-	if (result) {
-		sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>", 
-			    result, iocb->addr, iocb->size);
-		goto err_save;
+	for (i = 0; i < iocb->page_count; ++i) {
+		iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
 	}
 
-	up_write(&iocb->mm->mmap_sem);
-	cap_t(current->cap_effective) = real_cap;
-
 	return 0;
-err_save:
-
-	(void)do_iocb_unlock(iocb);
-err_lock:
-	/*
-	 * unlock the mm and restore capabilities.
-	 */
-	up_write(&iocb->mm->mmap_sem);
-	cap_t(current->cap_effective) = real_cap;
-
-	iocb->flags &= ~SDP_IOCB_F_LOCKED;
-	iocb->mm = NULL;
 
+err_get:
+	/* release whatever pages did get pinned and return a real error */
+	for (i = 0; i < result; ++i)
+		put_page(iocb->page_array[i]);
+	if (result >= 0)
+		result = -EFAULT;
+	kfree(iocb->page_array);
+err_page:
+	kfree(iocb->addr_array);
+err_addr:
 	return result;
 }
 
 /*
- * IOCB memory locking init functions
- */
-struct kallsym_iter {
-	loff_t pos;
-	struct module *owner;
-	unsigned long value;
-	unsigned int nameoff; /* If iterating in core kernel symbols */
-	char type;
-	char name[128];
-};
-
-/*
- * sdp_mem_lock_init - initialize the userspace memory locking
- */
-static int sdp_mem_lock_init(void)
-{
-	struct file *kallsyms;
-	struct seq_file *seq;
-	struct kallsym_iter *iter;
-	loff_t pos = 0;
-	int ret = -EINVAL;
-	
-	sdp_dbg_init("Memory Locking initialization.");
-	
-	kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
-	if (!kallsyms) {
-		sdp_warn("Failed to open /proc/kallsyms");
-		goto done;
-	}
-
-	seq = (struct seq_file *)kallsyms->private_data;
-	if (!seq) {
-		sdp_warn("Failed to fetch sequential file.");
-		goto err_close;
-	}
-
-	for (iter = seq->op->start(seq, &pos);
-	     iter != NULL;
-	     iter = seq->op->next(seq, iter, &pos))
-		if (!strcmp(iter->name, "do_mlock"))
-			mlock_ptr = (do_mlock_ptr_t)iter->value;
-
-	if (!mlock_ptr)
-		sdp_warn("Failed to find lock pointer.");
-	else
-		ret = 0;
-
-err_close:
-	filp_close(kallsyms, NULL);
-done:
-	return ret;
-}
-
-/*
- * sdp_mem_lock_cleanup - cleanup the memory locking tables
- */
-static int sdp_mem_lock_cleanup(void)
-{
-	sdp_dbg_init("Memory Locking cleanup.");
-	/*
-	 * null out entries.
-	 */
-	mlock_ptr = NULL;
-	
-	return 0;
-}
-
-/*
  * IOCB memory registration functions
  */
 
@@ -831,28 +640,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
 }
 
 /*
- * primary initialization/cleanup functions
- */
-
-/*
  * sdp_main_iocb_init - initialize the advertisment caches
  */
 int sdp_main_iocb_init(void)
 {
-	int result;
-
 	sdp_dbg_init("IOCB cache initialization.");
-	/*
-	 * initialize locking code.
-	 */
-	result =  sdp_mem_lock_init();
-	if (result < 0) {
-		sdp_warn("Error <%d> initializing memory locking.", result);
-		return result;
-	}
-	/*
-	 * initialize the caches only once.
-	 */
+
 	if (sdp_iocb_cache) {
 		sdp_warn("IOCB caches already initialized.");
 		return -EINVAL;
@@ -862,15 +655,10 @@ int sdp_main_iocb_init(void)
 					     sizeof(struct sdpc_iocb),
 					     0, SLAB_HWCACHE_ALIGN, NULL,
 					     NULL);
-	if (!sdp_iocb_cache) {
-		result = -ENOMEM;
-		goto error_iocb_c;
-	}
+	if (!sdp_iocb_cache)
+		return -ENOMEM;
 
 	return 0;
-error_iocb_c:
-	(void)sdp_mem_lock_cleanup();
-	return result;
 }
 
 /*
@@ -879,16 +667,6 @@ error_iocb_c:
 void sdp_main_iocb_cleanup(void)
 {
 	sdp_dbg_init("IOCB cache cleanup.");
-	/*
-	 * cleanup the caches
-	 */
 	kmem_cache_destroy(sdp_iocb_cache);
-	/*
-	 * null out entries.
-	 */
 	sdp_iocb_cache = NULL;
-	/*
-	 * cleanup memory locking
-	 */
-	(void)sdp_mem_lock_cleanup();
 }
Index: ulp/sdp/sdp_iocb.h
===================================================================
--- ulp/sdp/sdp_iocb.h	(revision 2235)
+++ ulp/sdp/sdp_iocb.h	(working copy)
@@ -99,9 +99,11 @@ struct sdpc_iocb {
 	/*
 	 * page list. data for locking/registering userspace
 	 */
-	struct mm_struct *mm;      /* user mm struct */
-	unsigned long     addr;    /* user space address */
-	size_t            size;    /* total size of the user buffer */
+	struct mm_struct   *mm;      /* user mm struct */
+	struct task_struct *tsk;
+	unsigned long       addr;    /* user space address */
+	size_t              size;    /* total size of the user buffer */
+	int		    is_receive;
 
 	struct page **page_array;  /* list of page structure pointers. */
 	u64          *addr_array;  /* list of physical page addresses. */
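
For reference, the completion-time check is conceptually this (again a
hypothetical sketch, not the patch code; fixup_after_cow is a made-up
name, and plain kmap is used only for readability -- the patch itself
uses kmap_atomic with the KM_IRQ slots since completion may run with
interrupts disabled):

#include <linux/mm.h>
#include <linux/highmem.h>

static void fixup_after_cow(struct task_struct *tsk, struct mm_struct *mm,
			    unsigned long uaddr, struct page *dma_page)
{
	struct page *cur = NULL;

	down_read(&mm->mmap_sem);
	if (get_user_pages(tsk, mm, uaddr & PAGE_MASK, 1,
			   1 /* write */, 0, &cur, NULL) != 1)
		cur = NULL;
	up_read(&mm->mmap_sem);

	if (cur) {
		if (cur != dma_page) {
			/* COW moved the page: copy the DMA'd data over */
			void *from = kmap(dma_page);
			void *to   = kmap(cur);
			memcpy(to, from, PAGE_SIZE);
			kunmap(cur);
			kunmap(dma_page);
			set_page_dirty_lock(cur);
		}
		put_page(cur);
	}
}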


-- 
MST - Michael S. Tsirkin


