[openib-general] [PATCH] AIO code to use get_user_pages

Michael S. Tsirkin mst at mellanox.co.il
Thu Mar 10 04:19:41 PST 2005


Well, I went ahead and modified the AIO code to use get_user_pages.
Since we don't yet have FMR support, this patch is untested, but it
does compile :)  Please let me know what you think.
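
In case it helps review, the core pattern the code moves to is the usual
get_user_pages() pin/release dance. A minimal standalone sketch against the
2.6 API (the helper name and error handling here are illustrative only, not
the actual SDP code):

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* Pin npages of the user buffer starting at addr; on success the caller
 * owns one reference per page and drops it with page_cache_release(). */
static int pin_user_buffer(unsigned long addr, int npages, int write,
			   struct page **pages)
{
	int got;

	/* get_user_pages() must be called with mmap_sem held for read */
	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, addr & PAGE_MASK,
			     npages, write, 0 /* no force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (got < 0)
		return got;
	if (got == npages)
		return 0;

	/* partial pin: drop whatever we did get and fail */
	while (got--)
		page_cache_release(pages[got]);
	return -EFAULT;
}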

Another approach (instead of waiting for FMR support) could be to add
a fall-back option that uses a regular memory region.
A TODO item is to add zcopy support for synchronous operations.

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>


Index: sdp_send.c
===================================================================
--- sdp_send.c	(revision 1972)
+++ sdp_send.c	(working copy)
@@ -2195,6 +2195,7 @@ skip: /* entry point for IOCB based tran
 		iocb->req  = req;
 		iocb->key  = req->ki_key;
 		iocb->addr = (unsigned long)msg->msg_iov->iov_base;
+		iocb->is_receive = 0;
       
 		req->ki_cancel = sdp_inet_write_cancel;
 
Index: sdp_recv.c
===================================================================
--- sdp_recv.c	(revision 1972)
+++ sdp_recv.c	(working copy)
@@ -1459,6 +1459,7 @@ int sdp_inet_recv(struct kiocb  *req, st
 			iocb->req  = req;
 			iocb->key  = req->ki_key;
 			iocb->addr = (unsigned long)msg->msg_iov->iov_base;
+			iocb->is_receive = 1;
       
 			req->ki_cancel = sdp_inet_read_cancel;
 
Index: sdp_iocb.c
===================================================================
--- sdp_iocb.c	(revision 1972)
+++ sdp_iocb.c	(working copy)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -31,89 +32,107 @@
  *
  * $Id$
  */
-
+#include <linux/pagemap.h>
 #include "sdp_main.h"
 
 static kmem_cache_t *sdp_iocb_cache = NULL;
 
-/*
- * memory locking functions
- */
-#include <linux/utsname.h>
-
-typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
-static do_mlock_ptr_t mlock_ptr = NULL;
+static void sdp_copy_one_page(struct page *from, struct page *to,
+			      unsigned long iocb_addr, size_t iocb_size,
+			      unsigned long uaddr)
+{
+	size_t size_left = iocb_addr + iocb_size - uaddr;
+	size_t size = min_t(size_t, size_left, PAGE_SIZE);
+	unsigned long offset = uaddr % PAGE_SIZE;
+	void *fptr;
+	void *tptr;
+
+	fptr = kmap_atomic(from, KM_USER0);
+	tptr = kmap_atomic(to, KM_USER1);	/* must use a different atomic kmap slot */
+
+	memcpy(tptr + offset, fptr + offset, size);
+
+	kunmap_atomic(tptr, KM_USER1);
+	kunmap_atomic(fptr, KM_USER0);
+	set_page_dirty_lock(to);
+}
 
 /*
- * do_iocb_unlock - unlock the memory for an IOCB
+ * sdp_iocb_unlock - unlock the memory for an IOCB
+ * Copy the data back if the pages have moved since they were pinned.
  */
-static int do_iocb_unlock(struct sdpc_iocb *iocb)
+int sdp_iocb_unlock(struct sdpc_iocb *iocb)
 {
-	struct vm_area_struct *vma;
+	int result = 0;
+	struct page **pages = NULL;
+	unsigned long uaddr;
+	int i;
 
-	vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
-	if (!vma)
-		sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
-			 iocb->addr, iocb->size);
+	if (!(iocb->flags & SDP_IOCB_F_LOCKED))
+		return 0;
 
-	while (vma) {
-		sdp_dbg_data(NULL, 
-			     "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
-			     iocb->addr, vma, vma->vm_start, vma->vm_end,
-			     vma->vm_flags, (long)vma->vm_private_data);
-		
-		spin_lock(&iocb->mm->page_table_lock);
-		/*
-		 * if there are no more references to the vma
-		 */
-		vma->vm_private_data--;
- 
-		if (!vma->vm_private_data) {
-			/*
-			 * modify VM flags.
-			 */
-			vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
-			/*
-			 * adjust locked page count
-			 */
-			vma->vm_mm->locked_vm -= ((vma->vm_end -
-						   vma->vm_start) >> 
-						  PAGE_SHIFT);
-		}
+	/* A send only read the pages: just release them and we are done */
+	if (!iocb->is_receive) {
+		for (i = 0; i < iocb->page_count; ++i)
+			page_cache_release(iocb->page_array[i]);
+		goto done;
+	}
 
-		spin_unlock(&iocb->mm->page_table_lock);
-		/*
-		 * continue if the buffer continues onto the next vma
-		 */
-		if ((iocb->addr + iocb->size) > vma->vm_end)
-			vma = vma->vm_next;
-		else
-			vma = NULL;
+	/* A receive wrote to the pages: check they did not get remapped */
+
+	/* As an optimisation (to avoid scanning the vma tree each time),
+	 * try to get all pages in one go. */
+	/* TODO: use cache for allocations? Allocate by chunks? */
+
+	pages = kmalloc((sizeof(struct page *) *
+			 iocb->page_count), GFP_KERNEL);
+
+	down_read(&iocb->mm->mmap_sem);
+
+	if (pages) {
+		result = get_user_pages(iocb->tsk, iocb->mm,
+					iocb->addr,
+					iocb->page_count, iocb->is_receive, 0,
+					pages, NULL);
+
+		if (result != iocb->page_count) {
+			kfree(pages);
+			pages = NULL;
+		}
 	}
 
-	return 0;
-}
+	for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
+	     ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE)
+	{
+		struct page *page;
+		set_page_dirty_lock(iocb->page_array[i]);
+
+		if (pages)
+			page = pages[i];
+		else {
+			result = get_user_pages(iocb->tsk, iocb->mm,
+						uaddr & PAGE_MASK,
+						1, 1, 0, &page, NULL);
+			if (result != 1) {
+				page = NULL;
+			}
+		}
 
-/*
- * sdp_iocb_unlock - unlock the memory for an IOCB
- */
-int sdp_iocb_unlock(struct sdpc_iocb *iocb)
-{
-	int result;
+		if (page && iocb->page_array[i] != page)
+			sdp_copy_one_page(iocb->page_array[i], page,
+					  iocb->addr, iocb->size, uaddr);
 
-	/*
-	 * check if IOCB is locked.
-	 */
-	if (!(iocb->flags & SDP_IOCB_F_LOCKED))
-		return 0;
-	/*
-	 * spin lock since this could be from interrupt context.
-	 */
-	down_write(&iocb->mm->mmap_sem);
-	
-	result = do_iocb_unlock(iocb);
+		if (page)
+			page_cache_release(page);
+		page_cache_release(iocb->page_array[i]);
+	}
+
+	up_read(&iocb->mm->mmap_sem);
 
-	up_write(&iocb->mm->mmap_sem);
+	if (pages)
+		kfree(pages);
+
+done:
 
 	kfree(iocb->page_array);
 	kfree(iocb->addr_array);
@@ -121,37 +140,41 @@ int sdp_iocb_unlock(struct sdpc_iocb *io
 	iocb->page_array = NULL;
 	iocb->addr_array = NULL;
 	iocb->mm = NULL;
-	/*
-	 * mark IOCB unlocked.
-	 */
+	iocb->tsk = NULL;
+
 	iocb->flags &= ~SDP_IOCB_F_LOCKED;
 
 	return result;
 }
 
 /*
- * sdp_iocb_page_save - save page information for an IOCB
+ * sdp_iocb_lock - lock the memory for an IOCB
+ * We do not take a reference on the mm; AIO handles this for us.
  */
-static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
+int sdp_iocb_lock(struct sdpc_iocb *iocb)
 {
-	unsigned int counter;
+	int result = -ENOMEM;
 	unsigned long addr;
 	size_t size;
-	int result = -ENOMEM;
-	struct page *page;
-	unsigned long pfn;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep;
-	pte_t  pte;
+	int i;
+	/*
+	 * iocb->addr - buffer start address
+	 * iocb->size - buffer length
+	 * addr       - page aligned
+	 * size       - page multiple
+	 */
+	addr = iocb->addr & PAGE_MASK;
+	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
 
-	if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
-		return -EINVAL;
+	iocb->page_offset = iocb->addr - addr;
+	
+	iocb->page_count = size >> PAGE_SHIFT;
 	/*
 	 * create array to hold page value which are later needed to register
 	 * the buffer with the HCA
 	 */
+
+	/* TODO: use cache for allocations? Allocate by chunks? */
 	iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
 				   GFP_KERNEL);
 	if (!iocb->addr_array)
@@ -161,259 +184,41 @@ static int sdp_iocb_page_save(struct sdp
 				   GFP_KERNEL);
 	if (!iocb->page_array)
 		goto err_page;
-	/*
-	 * iocb->addr - buffer start address
-	 * iocb->size - buffer length
-	 * addr       - page aligned
-	 * size       - page multiple
-	 */
-	addr = iocb->addr & PAGE_MASK;
-	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
 
-	iocb->page_offset = iocb->addr - addr;
-	/*
-	 * Find pages used within the buffer which will then be registered
-	 * for RDMA
-	 */
-	spin_lock(&iocb->mm->page_table_lock);
+	down_read(&current->mm->mmap_sem);
 
-	for (counter = 0;
-	     size > 0;
-	     counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
-		pgd = pgd_offset_gate(iocb->mm, addr);
-		if (!pgd || pgd_none(*pgd))
-			break;
-
-		pud = pud_offset(pgd, addr);
-		if (!pud || pud_none(*pud))
-			break;
-		
-		pmd = pmd_offset(pud, addr);
-		if (!pmd || pmd_none(*pmd))
-			break;
-
-		ptep = pte_offset_map(pmd, addr);
-		if (!ptep) 
-			break;
-
-		pte = *ptep;
-		pte_unmap(ptep);
-
-		if (!pte_present(pte))
-			break;
-
-		pfn = pte_pfn(pte);
-		if (!pfn_valid(pfn))
-			break;
-        
-		page = pfn_to_page(pfn);
-
-		iocb->page_array[counter] = page;
-		iocb->addr_array[counter] = page_to_phys(page);
+	result = get_user_pages(current, current->mm, iocb->addr,
+				iocb->page_count, iocb->is_receive, 0,
+				iocb->page_array, NULL);
+
+	up_read(&current->mm->mmap_sem);
+
+	if (result != iocb->page_count) {
+		sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
+			    iocb->addr, iocb->size, result, iocb->page_count);
+		goto err_get;
 	}
 
-	spin_unlock(&iocb->mm->page_table_lock);
-	
-	if (size > 0) {
-		result = -EFAULT;
-		goto err_find;
-	}
-
-	return 0;
-err_find:
-	
-	kfree(iocb->page_array);
-	iocb->page_array = NULL;
-err_page:
-
-	kfree(iocb->addr_array);
-	iocb->addr_array = NULL;
-err_addr:
-
-	return result;
-}
-
-/*
- * sdp_iocb_lock - lock the memory for an IOCB
- */
-int sdp_iocb_lock(struct sdpc_iocb *iocb)
-{
-	struct vm_area_struct *vma;
-	kernel_cap_t real_cap;
-	unsigned long limit;
-	int result = -ENOMEM;
-	unsigned long addr;
-	size_t        size;
-	
-	/*
-	 * mark IOCB as locked. We do not take a reference on the mm, AIO
-	 * handles this for us.
-	 */
 	iocb->flags |= SDP_IOCB_F_LOCKED;
 	iocb->mm     = current->mm;
-	/*
-	 * save and raise capabilities
-	 */
-	real_cap = cap_t(current->cap_effective);
-	cap_raise(current->cap_effective, CAP_IPC_LOCK);
-  
-	size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
-	addr = iocb->addr & PAGE_MASK;
-
-	iocb->page_count = size >> PAGE_SHIFT;
+	iocb->tsk    = current;
 
-	limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-	limit >>= PAGE_SHIFT;
-	/*
-	 * lock the mm, if within the limit lock the address range.
-	 */
-	down_write(&iocb->mm->mmap_sem);
 
-	if (!((iocb->page_count + current->mm->locked_vm) > limit))
-		result = (*mlock_ptr)(addr, size, 1);
-	/*
-	 * process result
-	 */
-	if (result) {
-		sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
-			    iocb->addr, iocb->size, result, 
-			    iocb->page_count, iocb->mm->locked_vm, limit);
-		goto err_lock;
+	for (i = 0; i < iocb->page_count; ++i) {
+		iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
 	}
-	/*
-	 * look up the head of the vma queue, loop through the vmas, marking
-	 * them do not copy, reference counting, and saving them. 
-	 */
-	vma = find_vma(iocb->mm, addr);
-	if (!vma)
-		/*
-		 * sanity check.
-		 */
-		sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
-			 iocb->addr, iocb->size);
-
-	while (vma) {
-		spin_lock(&iocb->mm->page_table_lock);
-
-		if (!(VM_LOCKED & vma->vm_flags))
-			sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
-
-		if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
-			sdp_dbg_err("VMA: private daya in use! <%08lx>",
-				    (unsigned long)vma->vm_private_data);
-		
-		vma->vm_flags |= VM_DONTCOPY;
-		vma->vm_private_data++;
-
-		spin_unlock(&iocb->mm->page_table_lock);
-
-		sdp_dbg_data(NULL, 
-			     "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
-			     iocb->addr, vma, vma->vm_start, vma->vm_end,
-			     vma->vm_flags, (long)vma->vm_private_data);
-
-		if ((addr + size) > vma->vm_end)
-			vma = vma->vm_next;
-		else
-			vma = NULL;
-	}
-
-	result = sdp_iocb_page_save(iocb);
-	if (result) {
-		sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>", 
-			    result, iocb->addr, iocb->size);
-		goto err_save;
-	}
-
-	up_write(&iocb->mm->mmap_sem);
-	cap_t(current->cap_effective) = real_cap;
 
 	return 0;
-err_save:
-
-	(void)do_iocb_unlock(iocb);
-err_lock:
-	/*
-	 * unlock the mm and restore capabilities.
-	 */
-	up_write(&iocb->mm->mmap_sem);
-	cap_t(current->cap_effective) = real_cap;
-
-	iocb->flags &= ~SDP_IOCB_F_LOCKED;
-	iocb->mm = NULL;
 
+err_get:
+	kfree(iocb->page_array);
+err_page:
+	kfree(iocb->addr_array);
+err_addr:
 	return result;
 }
 
 /*
- * IOCB memory locking init functions
- */
-struct kallsym_iter {
-	loff_t pos;
-	struct module *owner;
-	unsigned long value;
-	unsigned int nameoff; /* If iterating in core kernel symbols */
-	char type;
-	char name[128];
-};
-
-/*
- * sdp_mem_lock_init - initialize the userspace memory locking
- */
-static int sdp_mem_lock_init(void)
-{
-	struct file *kallsyms;
-	struct seq_file *seq;
-	struct kallsym_iter *iter;
-	loff_t pos = 0;
-	int ret = -EINVAL;
-	
-	sdp_dbg_init("Memory Locking initialization.");
-	
-	kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
-	if (!kallsyms) {
-		sdp_warn("Failed to open /proc/kallsyms");
-		goto done;
-	}
-
-	seq = (struct seq_file *)kallsyms->private_data;
-	if (!seq) {
-		sdp_warn("Failed to fetch sequential file.");
-		goto err_close;
-	}
-
-	for (iter = seq->op->start(seq, &pos);
-	     iter != NULL;
-	     iter = seq->op->next(seq, iter, &pos))
-		if (!strcmp(iter->name, "do_mlock"))
-			mlock_ptr = (do_mlock_ptr_t)iter->value;
-
-	if (!mlock_ptr)
-		sdp_warn("Failed to find lock pointer.");
-	else
-		ret = 0;
-
-err_close:
-	filp_close(kallsyms, NULL);
-done:
-	return ret;
-}
-
-/*
- * sdp_mem_lock_cleanup - cleanup the memory locking tables
- */
-static int sdp_mem_lock_cleanup(void)
-{
-	sdp_dbg_init("Memory Locking cleanup.");
-	/*
-	 * null out entries.
-	 */
-	mlock_ptr = NULL;
-	
-	return 0;
-}
-
-/*
  * IOCB memory registration functions
  */
 
@@ -831,28 +636,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
 }
 
 /*
- * primary initialization/cleanup functions
- */
-
-/*
  * sdp_main_iocb_init - initialize the advertisment caches
  */
 int sdp_main_iocb_init(void)
 {
-	int result;
-
 	sdp_dbg_init("IOCB cache initialization.");
-	/*
-	 * initialize locking code.
-	 */
-	result =  sdp_mem_lock_init();
-	if (result < 0) {
-		sdp_warn("Error <%d> initializing memory locking.", result);
-		return result;
-	}
-	/*
-	 * initialize the caches only once.
-	 */
+
 	if (sdp_iocb_cache) {
 		sdp_warn("IOCB caches already initialized.");
 		return -EINVAL;
@@ -862,15 +651,10 @@ int sdp_main_iocb_init(void)
 					     sizeof(struct sdpc_iocb),
 					     0, SLAB_HWCACHE_ALIGN, NULL,
 					     NULL);
-	if (!sdp_iocb_cache) {
-		result = -ENOMEM;
-		goto error_iocb_c;
-	}
+	if (!sdp_iocb_cache)
+		return -ENOMEM;
 
 	return 0;
-error_iocb_c:
-	(void)sdp_mem_lock_cleanup();
-	return result;
 }
 
 /*
@@ -879,16 +663,6 @@ error_iocb_c:
 void sdp_main_iocb_cleanup(void)
 {
 	sdp_dbg_init("IOCB cache cleanup.");
-	/*
-	 * cleanup the caches
-	 */
 	kmem_cache_destroy(sdp_iocb_cache);
-	/*
-	 * null out entries.
-	 */
 	sdp_iocb_cache = NULL;
-	/*
-	 * cleanup memory locking
-	 */
-	(void)sdp_mem_lock_cleanup();
 }
Index: sdp_iocb.h
===================================================================
--- sdp_iocb.h	(revision 1972)
+++ sdp_iocb.h	(working copy)
@@ -99,9 +99,11 @@ struct sdpc_iocb {
 	/*
 	 * page list. data for locking/registering userspace
 	 */
-	struct mm_struct *mm;      /* user mm struct */
-	unsigned long     addr;    /* user space address */
-	size_t            size;    /* total size of the user buffer */
+	struct mm_struct   *mm;      /* user mm struct */
+	struct task_struct *tsk;     /* task that locked the buffer */
+	unsigned long       addr;    /* user space address */
+	size_t              size;    /* total size of the user buffer */
+	int		    is_receive;  /* non-zero for receive IOCBs */
 
 	struct page **page_array;  /* list of page structure pointers. */
 	u64          *addr_array;  /* list of physical page addresses. */

-- 
MST - Michael S. Tsirkin