[openib-general] [PATCH] sdp: replace mlock with get_user_pages

Michael S. Tsirkin mst at mellanox.co.il
Thu Jul 21 00:51:54 PDT 2005


Hello, Libor!
Here's a patch updated against the latest bits.
I'd like it to get merged, if possible.

The existing code's playing with VMAs is the #1 source of sdp problems for me.
The patch below has been working without problems for me, so far.

I am aware of the aio/synchronous ordering race we currently have, and I am
testing a patch for it, but please note that the patch below doesn't make this
problem any worse, so I consider it an orthogonal issue.

There are also some additional optimisation ideas, like completing send iocbs
in IRQ context - again, what I have here is the simplest version,
and it performs at least as well as the mlock hack.

Please comment.

---

The following patch replaces the mlock hack with a call to get_user_pages.
Since the application could have forked while an iocb is outstanding, when an
iocb is done I call get_user_pages a second time and copy the data if the
physical page has changed.

Thus, changing ulimit is no longer required to get aio working, and processes
are also allowed to fork and to call mlock/munlock on the buffer.
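
For reference, the completion path for a receive iocb boils down to roughly
the following (a sketch only - error handling and the bulk get_user_pages
optimisation are omitted; the real code is sdp_iocb_unlock() below):

	/* Sketch: when a receive iocb completes, re-pin each user page and
	 * copy the received data across if COW has replaced the page we
	 * originally pinned (e.g. because the process forked). */
	unsigned long uaddr = iocb->addr;
	int i;

	down_read(&iocb->mm->mmap_sem);
	for (i = 0; i < iocb->page_count; ++i) {
		struct page *page;

		set_page_dirty_lock(iocb->page_array[i]);
		/* look up the page the virtual address maps to right now */
		if (get_user_pages(iocb->tsk, iocb->mm, uaddr & PAGE_MASK,
				   1, 1, 0, &page, NULL) == 1) {
			if (page != iocb->page_array[i])
				sdp_copy_one_page(iocb->page_array[i], page,
						  iocb->addr, iocb->size, uaddr);
			put_page(page);
		}
		put_page(iocb->page_array[i]);	/* drop our original pin */
		uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE;
	}
	up_read(&iocb->mm->mmap_sem);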

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: linux-2.6.12.2/drivers/infiniband/ulp/sdp/sdp_recv.c
===================================================================
--- linux-2.6.12.2.orig/drivers/infiniband/ulp/sdp/sdp_recv.c
+++ linux-2.6.12.2/drivers/infiniband/ulp/sdp/sdp_recv.c
@@ -1347,6 +1347,8 @@ int sdp_inet_recv(struct kiocb  *req, st
 			iocb->addr = ((unsigned long)msg->msg_iov->iov_base -
 				      copied);
 
+			iocb->flags |= SDP_IOCB_F_RECV;
+
 			req->ki_cancel = sdp_inet_read_cancel;
 
 			result = sdp_iocb_lock(iocb);
Index: linux-2.6.12.2/drivers/infiniband/ulp/sdp/sdp_iocb.c
===================================================================
--- linux-2.6.12.2.orig/drivers/infiniband/ulp/sdp/sdp_iocb.c
+++ linux-2.6.12.2/drivers/infiniband/ulp/sdp/sdp_iocb.c
@@ -33,377 +33,182 @@
  * $Id: sdp_iocb.c 2762 2005-06-30 19:01:38Z libor $
  */
 
+#include <linux/pagemap.h>
 #include "sdp_main.h"
 
 static kmem_cache_t *sdp_iocb_cache = NULL;
 
-/*
- * memory locking functions
- */
-#include <linux/utsname.h>
-
-typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
-static do_mlock_ptr_t mlock_ptr = NULL;
-
-/*
- * do_iocb_unlock - unlock the memory for an IOCB
- */
-static void do_iocb_unlock(struct sdpc_iocb *iocb)
-{
-	struct vm_area_struct *vma;
-
-	vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
-	if (!vma)
-		sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
-			 iocb->addr, iocb->size);
-
-	while (vma) {
-		sdp_dbg_data(NULL, 
-			     "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
-			     iocb->addr, vma, vma->vm_start, vma->vm_end,
-			     vma->vm_flags, (long)vma->vm_private_data);
-		
-		spin_lock(&iocb->mm->page_table_lock);
-		/*
-		 * if there are no more references to the vma
-		 */
-		vma->vm_private_data--;
- 
-		if (!vma->vm_private_data) {
-			/*
-			 * modify VM flags.
-			 */
-			vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
-			/*
-			 * adjust locked page count
-			 */
-			vma->vm_mm->locked_vm -= ((vma->vm_end -
-						   vma->vm_start) >> 
-						  PAGE_SHIFT);
-		}
-
-		spin_unlock(&iocb->mm->page_table_lock);
-		/*
-		 * continue if the buffer continues onto the next vma
-		 */
-		if ((iocb->addr + iocb->size) > vma->vm_end)
-			vma = vma->vm_next;
-		else
-			vma = NULL;
-	}
+static void sdp_copy_one_page(struct page *from, struct page* to, 
+		      unsigned long iocb_addr, size_t iocb_size,
+		      unsigned long uaddr)
+{
+	size_t size_left = iocb_addr + iocb_size - uaddr;
+	size_t size = min(size_left,PAGE_SIZE);
+	unsigned long offset = uaddr % PAGE_SIZE;
+	unsigned long flags;
+
+	void* fptr;
+	void* tptr;
+
+	local_irq_save(flags);
+	fptr = kmap_atomic(from, KM_IRQ0);
+	tptr = kmap_atomic(to, KM_IRQ1);
+
+	memcpy(tptr + offset, fptr + offset, size);
+
+	kunmap_atomic(tptr, KM_IRQ1);
+	kunmap_atomic(fptr, KM_IRQ0);
+	local_irq_restore(flags);
+	set_page_dirty_lock(to);
 }
 
 /*
  * sdp_iocb_unlock - unlock the memory for an IOCB
+ * Copy if pages moved since.
+ * TODO: is this needed?
  */
 void sdp_iocb_unlock(struct sdpc_iocb *iocb)
 {
-	/*
-	 * check if IOCB is locked.
-	 */
+ 	int result;
+	struct page ** pages = NULL;
+	unsigned long uaddr;
+	int i;
+
 	if (!(iocb->flags & SDP_IOCB_F_LOCKED))
 		return;
-	/*
-	 * spin lock since this could be from interrupt context.
-	 */
-	down_write(&iocb->mm->mmap_sem);
-	
-	do_iocb_unlock(iocb);
-
-	up_write(&iocb->mm->mmap_sem);
 
+	/* For read, unlock and we are done */
+	if (!(iocb->flags & SDP_IOCB_F_RECV)) {
+		for (i = 0;i < iocb->page_count; ++i)
+			put_page(iocb->page_array[i]);
+		goto done;
+	}
+ 
+	/* For write, we must check the virtual pages did not get remapped */
+ 
+	/* As an optimisation (to avoid scanning the vma tree each time),
+	 * try to get all pages in one go. */
+	/* TODO: use cache for allocations? Allocate by chunks? */
+ 
+	pages = kmalloc((sizeof(struct page *) * iocb->page_count), GFP_KERNEL);
+	down_read(&iocb->mm->mmap_sem);
+	if (pages) {
+		result = get_user_pages(iocb->tsk, iocb->mm, iocb->addr,
+					iocb->page_count, 1, 0, pages, NULL);
+		if (result != iocb->page_count) {
+			kfree(pages);
+			pages = NULL;
+		}
+	}
+	for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
+	     ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE)
+	{
+		struct page* page;
+		set_page_dirty_lock(iocb->page_array[i]);
+
+		if (pages)
+			page = pages[i];
+		else {
+			result = get_user_pages(iocb->tsk, iocb->mm,
+						uaddr & PAGE_MASK,
+						1 , 1, 0, &page, NULL);
+			if (result != 1) {
+				page = NULL;
+			}
+		}
+		if (page && iocb->page_array[i] != page)
+			sdp_copy_one_page(iocb->page_array[i], page,
+					  iocb->addr, iocb->size, uaddr);
+		if (page)
+			put_page(page);
+		put_page(iocb->page_array[i]);
+ 	}
+	up_read(&iocb->mm->mmap_sem);
+	if (pages)
+		kfree(pages);
+ 
+done:
 	kfree(iocb->page_array);
-	kfree(iocb->addr_array);
+ 	kfree(iocb->addr_array);
 
 	iocb->page_array = NULL;
-	iocb->addr_array = NULL;
+ 	iocb->addr_array = NULL;
 	iocb->mm = NULL;
-	/*
-	 * mark IOCB unlocked.
-	 */
+	iocb->tsk = NULL;
 	iocb->flags &= ~SDP_IOCB_F_LOCKED;
 }
 
 /*
- * sdp_iocb_page_save - save page information for an IOCB
+ * sdp_iocb_lock - lock the memory for an IOCB
+ * We do not take a reference on the mm, AIO handles this for us.
  */
-static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
+int sdp_iocb_lock(struct sdpc_iocb *iocb)
 {
-	unsigned int counter;
+	int result = -ENOMEM;
 	unsigned long addr;
 	size_t size;
-	int result = -ENOMEM;
-	struct page *page;
-	unsigned long pfn;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep;
-	pte_t  pte;
+	int i;
 
-	if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
-		return -EINVAL;
-	/*
+ 	/*
+	 * iocb->addr - buffer start address
+	 * iocb->size - buffer length
+	 * addr       - page aligned
+	 * size       - page multiple
+ 	 */
+ 	addr = iocb->addr & PAGE_MASK;
+	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
+ 
+	iocb->page_offset = iocb->addr - addr;
+	
+ 	iocb->page_count = size >> PAGE_SHIFT;
+ 	/*
 	 * create array to hold page value which are later needed to register
 	 * the buffer with the HCA
-	 */
+ 	 */
+ 
+	/* TODO: use cache for allocations? Allocate by chunks? */
 	iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
 				   GFP_KERNEL);
 	if (!iocb->addr_array)
 		goto err_addr;
-
+ 
 	iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count), 
 				   GFP_KERNEL);
 	if (!iocb->page_array)
 		goto err_page;
-	/*
-	 * iocb->addr - buffer start address
-	 * iocb->size - buffer length
-	 * addr       - page aligned
-	 * size       - page multiple
-	 */
-	addr = iocb->addr & PAGE_MASK;
-	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
-
-	iocb->page_offset = iocb->addr - addr;
-	/*
-	 * Find pages used within the buffer which will then be registered
-	 * for RDMA
-	 */
-	spin_lock(&iocb->mm->page_table_lock);
-
-	for (counter = 0;
-	     size > 0;
-	     counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
-		pgd = pgd_offset_gate(iocb->mm, addr);
-		if (!pgd || pgd_none(*pgd))
-			break;
-
-		pud = pud_offset(pgd, addr);
-		if (!pud || pud_none(*pud))
-			break;
-		
-		pmd = pmd_offset(pud, addr);
-		if (!pmd || pmd_none(*pmd))
-			break;
-
-		ptep = pte_offset_map(pmd, addr);
-		if (!ptep) 
-			break;
-
-		pte = *ptep;
-		pte_unmap(ptep);
-
-		if (!pte_present(pte))
-			break;
-
-		pfn = pte_pfn(pte);
-		if (!pfn_valid(pfn))
-			break;
-        
-		page = pfn_to_page(pfn);
-
-		iocb->page_array[counter] = page;
-		iocb->addr_array[counter] = page_to_phys(page);
-	}
-
-	spin_unlock(&iocb->mm->page_table_lock);
-	
-	if (size > 0) {
-		result = -EFAULT;
-		goto err_find;
+ 
+	down_read(&current->mm->mmap_sem);
+ 
+        result = get_user_pages(current, current->mm,
+				iocb->addr, iocb->page_count,
+			      !!(iocb->flags & SDP_IOCB_F_RECV), 0,
+			      iocb->page_array, NULL);
+
+	up_read(&current->mm->mmap_sem);
+
+	if (result != iocb->page_count) {
+		sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
+			    iocb->addr, iocb->size, result, iocb->page_count);
+		goto err_get;
 	}
-
-	return 0;
-err_find:
-	
+ 
+	iocb->flags |= SDP_IOCB_F_LOCKED;
+	iocb->mm     = current->mm;
+	iocb->tsk    = current;
+ 
+ 
+	for (i = 0; i< iocb->page_count; ++i) {
+		iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
+ 	}
+ 
+ 	return 0;
+ 
+err_get:
 	kfree(iocb->page_array);
-	iocb->page_array = NULL;
 err_page:
-
 	kfree(iocb->addr_array);
-	iocb->addr_array = NULL;
 err_addr:
-
-	return result;
-}
-
-/*
- * sdp_iocb_lock - lock the memory for an IOCB
- */
-int sdp_iocb_lock(struct sdpc_iocb *iocb)
-{
-	struct vm_area_struct *vma;
-	kernel_cap_t real_cap;
-	unsigned long limit;
-	int result = -ENOMEM;
-	unsigned long addr;
-	size_t        size;
-	
-	/*
-	 * mark IOCB as locked. We do not take a reference on the mm, AIO
-	 * handles this for us.
-	 */
-	iocb->flags |= SDP_IOCB_F_LOCKED;
-	iocb->mm     = current->mm;
-	/*
-	 * save and raise capabilities
-	 */
-	real_cap = cap_t(current->cap_effective);
-	cap_raise(current->cap_effective, CAP_IPC_LOCK);
-  
-	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
-	addr = iocb->addr & PAGE_MASK;
-
-	iocb->page_count = size >> PAGE_SHIFT;
-
-	limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-	limit >>= PAGE_SHIFT;
-	/*
-	 * lock the mm, if within the limit lock the address range.
-	 */
-	down_write(&iocb->mm->mmap_sem);
-
-	if (!((iocb->page_count + current->mm->locked_vm) > limit))
-		result = (*mlock_ptr)(addr, size, 1);
-	/*
-	 * process result
-	 */
-	if (result) {
-		sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
-			    iocb->addr, iocb->size, result, 
-			    iocb->page_count, iocb->mm->locked_vm, limit);
-		goto err_lock;
-	}
-	/*
-	 * look up the head of the vma queue, loop through the vmas, marking
-	 * them do not copy, reference counting, and saving them. 
-	 */
-	vma = find_vma(iocb->mm, addr);
-	if (!vma)
-		/*
-		 * sanity check.
-		 */
-		sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
-			 iocb->addr, iocb->size);
-
-	while (vma) {
-		spin_lock(&iocb->mm->page_table_lock);
-
-		if (!(VM_LOCKED & vma->vm_flags))
-			sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
-
-		if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
-			sdp_dbg_err("VMA: private daya in use! <%08lx>",
-				    (unsigned long)vma->vm_private_data);
-		
-		vma->vm_flags |= VM_DONTCOPY;
-		vma->vm_private_data++;
-
-		spin_unlock(&iocb->mm->page_table_lock);
-
-		sdp_dbg_data(NULL, 
-			     "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
-			     iocb->addr, vma, vma->vm_start, vma->vm_end,
-			     vma->vm_flags, (long)vma->vm_private_data);
-
-		if ((addr + size) > vma->vm_end)
-			vma = vma->vm_next;
-		else
-			vma = NULL;
-	}
-
-	result = sdp_iocb_page_save(iocb);
-	if (result) {
-		sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>", 
-			    result, iocb->addr, iocb->size);
-		goto err_save;
-	}
-
-	up_write(&iocb->mm->mmap_sem);
-	cap_t(current->cap_effective) = real_cap;
-
-	return 0;
-err_save:
-
-	do_iocb_unlock(iocb);
-err_lock:
-	/*
-	 * unlock the mm and restore capabilities.
-	 */
-	up_write(&iocb->mm->mmap_sem);
-	cap_t(current->cap_effective) = real_cap;
-
-	iocb->flags &= ~SDP_IOCB_F_LOCKED;
-	iocb->mm = NULL;
-
-	return result;
-}
-
-/*
- * IOCB memory locking init functions
- */
-struct kallsym_iter {
-	loff_t pos;
-	struct module *owner;
-	unsigned long value;
-	unsigned int nameoff; /* If iterating in core kernel symbols */
-	char type;
-	char name[128];
-};
-
-/*
- * sdp_mem_lock_init - initialize the userspace memory locking
- */
-static int sdp_mem_lock_init(void)
-{
-	struct file *kallsyms;
-	struct seq_file *seq;
-	struct kallsym_iter *iter;
-	loff_t pos = 0;
-	int ret = -EINVAL;
-	
-	sdp_dbg_init("Memory Locking initialization.");
-	
-	kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
-	if (!kallsyms) {
-		sdp_warn("Failed to open /proc/kallsyms");
-		goto done;
-	}
-
-	seq = (struct seq_file *)kallsyms->private_data;
-	if (!seq) {
-		sdp_warn("Failed to fetch sequential file.");
-		goto err_close;
-	}
-
-	for (iter = seq->op->start(seq, &pos);
-	     iter != NULL;
-	     iter = seq->op->next(seq, iter, &pos))
-		if (!strcmp(iter->name, "do_mlock"))
-			mlock_ptr = (do_mlock_ptr_t)iter->value;
-
-	if (!mlock_ptr)
-		sdp_warn("Failed to find lock pointer.");
-	else
-		ret = 0;
-
-err_close:
-	filp_close(kallsyms, NULL);
-done:
-	return ret;
-}
-
-/*
- * sdp_mem_lock_cleanup - cleanup the memory locking tables
- */
-static void sdp_mem_lock_cleanup(void)
-{
-	sdp_dbg_init("Memory Locking cleanup.");
-	/*
-	 * null out entries.
-	 */
-	mlock_ptr = NULL;
+ 	return result;
 }
 
 /*
@@ -802,28 +607,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
 }
 
 /*
- * primary initialization/cleanup functions
- */
-
-/*
  * sdp_main_iocb_init - initialize the advertisment caches
  */
 int sdp_main_iocb_init(void)
 {
-	int result;
-
 	sdp_dbg_init("IOCB cache initialization.");
-	/*
-	 * initialize locking code.
-	 */
-	result =  sdp_mem_lock_init();
-	if (result < 0) {
-		sdp_warn("Error <%d> initializing memory locking.", result);
-		return result;
-	}
-	/*
-	 * initialize the caches only once.
-	 */
+
 	if (sdp_iocb_cache) {
 		sdp_warn("IOCB caches already initialized.");
 		return -EINVAL;
@@ -833,15 +622,10 @@ int sdp_main_iocb_init(void)
 					     sizeof(struct sdpc_iocb),
 					     0, SLAB_HWCACHE_ALIGN, NULL,
 					     NULL);
-	if (!sdp_iocb_cache) {
-		result = -ENOMEM;
-		goto error_iocb_c;
-	}
+	if (!sdp_iocb_cache)
+		return -ENOMEM;
 
 	return 0;
-error_iocb_c:
-	sdp_mem_lock_cleanup();
-	return result;
 }
 
 /*
@@ -858,8 +642,4 @@ void sdp_main_iocb_cleanup(void)
 	 * null out entries.
 	 */
 	sdp_iocb_cache = NULL;
-	/*
-	 * cleanup memory locking
-	 */
-	sdp_mem_lock_cleanup();
 }
Index: linux-2.6.12.2/drivers/infiniband/ulp/sdp/sdp_iocb.h
===================================================================
--- linux-2.6.12.2.orig/drivers/infiniband/ulp/sdp/sdp_iocb.h
+++ linux-2.6.12.2/drivers/infiniband/ulp/sdp/sdp_iocb.h
@@ -53,7 +53,8 @@
 #define SDP_IOCB_F_RDMA_R 0x00000010 /* IOCB is in RDMA read processing */
 #define SDP_IOCB_F_RDMA_W 0x00000020 /* IOCB is in RDMA write processing */
 #define SDP_IOCB_F_LOCKED 0x00000040 /* IOCB is locked in memory */
-#define SDP_IOCB_F_REG    0x00000080 /* IOCB is memory is registered */
+#define SDP_IOCB_F_REG    0x00000080 /* IOCB memory is registered */
+#define SDP_IOCB_F_RECV   0x00000100 /* IOCB is for a receive request */
 #define SDP_IOCB_F_ALL    0xFFFFFFFF /* IOCB all mask */
 /*
  * zcopy constants.
@@ -100,9 +101,10 @@ struct sdpc_iocb {
 	/*
 	 * page list. data for locking/registering userspace
 	 */
-	struct mm_struct *mm;      /* user mm struct */
-	unsigned long     addr;    /* user space address */
-	size_t            size;    /* total size of the user buffer */
+	struct mm_struct   *mm;      /* user mm struct */
+	struct task_struct *tsk;
+	unsigned long       addr;    /* user space address */
+	size_t              size;    /* total size of the user buffer */
 
 	struct page **page_array;  /* list of page structure pointers. */
 	u64          *addr_array;  /* list of physical page addresses. */

-- 
MST
