[openib-general] [PATCH] AIO code to use get_user_pages
Michael S. Tsirkin
mst at mellanox.co.il
Thu Mar 10 04:19:41 PST 2005
Well, I went ahead and modified the AIO code to use get_user_pages.
Since we don't yet have FMR support, this patch is untested, but it
does compile :) Please let me know what you think.
Another approach (instead of waiting for FMR support) could be
to add a fall-back option to use a regular memory region.
A TODO item is to add zero-copy support for synchronous operations.
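For reference, the bare pin/release pattern this builds on looks like the
following (a minimal sketch against the 2.6 get_user_pages interface;
pin_user_buffer is just an illustrative name, not something the patch adds):

	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/* Pin "count" user pages starting at the page containing "addr".
	 * get_user_pages must run with mmap_sem held for reading; every
	 * page it returns carries a reference that page_cache_release
	 * must eventually drop. */
	static int pin_user_buffer(unsigned long addr, int count, int write,
				   struct page **pages)
	{
		int got;

		down_read(&current->mm->mmap_sem);
		got = get_user_pages(current, current->mm, addr & PAGE_MASK,
				     count, write, 0 /* don't force */,
				     pages, NULL);
		up_read(&current->mm->mmap_sem);

		if (got == count)
			return 0;

		/* partial pin: drop the references we did get */
		while (got > 0)
			page_cache_release(pages[--got]);
		return -EFAULT;
	}

The patch below does essentially this, plus keeping the page and physical
address arrays around for registering the buffer with the HCA, and
re-checking the mappings at unlock time for receives.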
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
Index: sdp_send.c
===================================================================
--- sdp_send.c (revision 1972)
+++ sdp_send.c (working copy)
@@ -2195,6 +2195,7 @@ skip: /* entry point for IOCB based tran
iocb->req = req;
iocb->key = req->ki_key;
iocb->addr = (unsigned long)msg->msg_iov->iov_base;
+ iocb->is_receive = 0;
req->ki_cancel = sdp_inet_write_cancel;
Index: sdp_recv.c
===================================================================
--- sdp_recv.c (revision 1972)
+++ sdp_recv.c (working copy)
@@ -1459,6 +1459,7 @@ int sdp_inet_recv(struct kiocb *req, st
iocb->req = req;
iocb->key = req->ki_key;
iocb->addr = (unsigned long)msg->msg_iov->iov_base;
+ iocb->is_receive = 1;
req->ki_cancel = sdp_inet_read_cancel;
Index: sdp_iocb.c
===================================================================
--- sdp_iocb.c (revision 1972)
+++ sdp_iocb.c (working copy)
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -31,89 +32,107 @@
*
* $Id$
*/
-
+#include <linux/pagemap.h>
#include "sdp_main.h"
static kmem_cache_t *sdp_iocb_cache = NULL;
-/*
- * memory locking functions
- */
-#include <linux/utsname.h>
-
-typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
-static do_mlock_ptr_t mlock_ptr = NULL;
+static void sdp_copy_one_page(struct page *from, struct page *to,
+			      unsigned long iocb_addr, size_t iocb_size,
+			      unsigned long uaddr)
+{
+	size_t size_left = iocb_addr + iocb_size - uaddr;
+	unsigned long offset = uaddr % PAGE_SIZE;
+	/* don't copy past the end of either the buffer or the page */
+	size_t size = min_t(size_t, size_left, PAGE_SIZE - offset);
+	void *fptr;
+	void *tptr;
+
+	/* two atomic kmaps at once need two distinct slots */
+	fptr = kmap_atomic(from, KM_USER0);
+	tptr = kmap_atomic(to, KM_USER1);
+
+	memcpy(tptr + offset, fptr + offset, size);
+
+	kunmap_atomic(tptr, KM_USER1);
+	kunmap_atomic(fptr, KM_USER0);
+	set_page_dirty_lock(to);
+}
/*
- * do_iocb_unlock - unlock the memory for an IOCB
+ * sdp_iocb_unlock - unlock the memory for an IOCB
+ * Copy the data back if the user's pages have moved since they were pinned.
*/
-static int do_iocb_unlock(struct sdpc_iocb *iocb)
+int sdp_iocb_unlock(struct sdpc_iocb *iocb)
{
- struct vm_area_struct *vma;
+ int result = 0;
+	struct page **pages = NULL;
+ unsigned long uaddr;
+ int i;
- vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
- if (!vma)
- sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
- iocb->addr, iocb->size);
+ if (!(iocb->flags & SDP_IOCB_F_LOCKED))
+ return 0;
- while (vma) {
- sdp_dbg_data(NULL,
- "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
- iocb->addr, vma, vma->vm_start, vma->vm_end,
- vma->vm_flags, (long)vma->vm_private_data);
-
- spin_lock(&iocb->mm->page_table_lock);
- /*
- * if there are no more references to the vma
- */
- vma->vm_private_data--;
-
- if (!vma->vm_private_data) {
- /*
- * modify VM flags.
- */
- vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
- /*
- * adjust locked page count
- */
- vma->vm_mm->locked_vm -= ((vma->vm_end -
- vma->vm_start) >>
- PAGE_SHIFT);
- }
+	/* For a send the pages were only read from: just release them. */
+	if (!iocb->is_receive) {
+		for (i = 0; i < iocb->page_count; ++i)
+ page_cache_release(iocb->page_array[i]);
+ goto done;
+ }
- spin_unlock(&iocb->mm->page_table_lock);
- /*
- * continue if the buffer continues onto the next vma
- */
- if ((iocb->addr + iocb->size) > vma->vm_end)
- vma = vma->vm_next;
- else
- vma = NULL;
+	/* For a receive the pinned pages were written to, so we must
+	 * check that the user's virtual pages did not get remapped
+	 * while the I/O was in flight. */
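+	/* (This can happen, for example, if the process forks after the
+	 * pages were pinned: copy-on-write can leave us holding references
+	 * to pages the application no longer sees.) */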
+
+ /* As an optimisation (to avoid scanning the vma tree each time),
+ * try to get all pages in one go. */
+ /* TODO: use cache for allocations? Allocate by chunks? */
+
+	pages = kmalloc(sizeof(struct page *) * iocb->page_count,
+			GFP_KERNEL);
+
+	down_read(&iocb->mm->mmap_sem);
+
+	if (pages) {
+		result = get_user_pages(iocb->tsk, iocb->mm,
+					iocb->addr & PAGE_MASK,
+					iocb->page_count, iocb->is_receive, 0,
+					pages, NULL);
+
+		if (result != iocb->page_count) {
+			/* fall back to looking pages up one by one */
+			for (i = 0; i < result; ++i)
+				page_cache_release(pages[i]);
+			kfree(pages);
+			pages = NULL;
+		}
+		result = 0;
+	}
- return 0;
-}
+	for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
+	     ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE) {
+		struct page *page;
+
+		/* the HCA wrote the received data into the pinned page */
+		set_page_dirty_lock(iocb->page_array[i]);
+
+		if (pages)
+			page = pages[i];
+		else if (get_user_pages(iocb->tsk, iocb->mm,
+					uaddr & PAGE_MASK, 1, 1, 0,
+					&page, NULL) != 1)
+			page = NULL;
-/*
- * sdp_iocb_unlock - unlock the memory for an IOCB
- */
-int sdp_iocb_unlock(struct sdpc_iocb *iocb)
-{
- int result;
+ if (page && iocb->page_array[i] != page)
+ sdp_copy_one_page(iocb->page_array[i], page,
+ iocb->addr, iocb->size, uaddr);
- /*
- * check if IOCB is locked.
- */
- if (!(iocb->flags & SDP_IOCB_F_LOCKED))
- return 0;
- /*
- * spin lock since this could be from interrupt context.
- */
- down_write(&iocb->mm->mmap_sem);
-
- result = do_iocb_unlock(iocb);
+ if (page)
+ page_cache_release(page);
+ page_cache_release(iocb->page_array[i]);
+ }
+
+ up_read(&iocb->mm->mmap_sem);
- up_write(&iocb->mm->mmap_sem);
+	kfree(pages);
+
+done:
kfree(iocb->page_array);
kfree(iocb->addr_array);
@@ -121,37 +140,41 @@ int sdp_iocb_unlock(struct sdpc_iocb *io
iocb->page_array = NULL;
iocb->addr_array = NULL;
iocb->mm = NULL;
- /*
- * mark IOCB unlocked.
- */
+ iocb->tsk = NULL;
+
iocb->flags &= ~SDP_IOCB_F_LOCKED;
return result;
}
/*
- * sdp_iocb_page_save - save page information for an IOCB
+ * sdp_iocb_lock - lock the memory for an IOCB
+ * We do not take a reference on the mm, AIO handles this for us.
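+ * The pages themselves are pinned by get_user_pages, so they cannot
+ * be reclaimed while the I/O is in flight.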
*/
-static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
+int sdp_iocb_lock(struct sdpc_iocb *iocb)
{
- unsigned int counter;
+ int result = -ENOMEM;
unsigned long addr;
size_t size;
- int result = -ENOMEM;
- struct page *page;
- unsigned long pfn;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
- pte_t pte;
+ int i;
+ /*
+ * iocb->addr - buffer start address
+ * iocb->size - buffer length
+ * addr - page aligned
+ * size - page multiple
+ */
+ addr = iocb->addr & PAGE_MASK;
+ size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
- if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
- return -EINVAL;
+ iocb->page_offset = iocb->addr - addr;
+
+ iocb->page_count = size >> PAGE_SHIFT;
/*
* create array to hold page value which are later needed to register
* the buffer with the HCA
*/
+
+ /* TODO: use cache for allocations? Allocate by chunks? */
iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
GFP_KERNEL);
if (!iocb->addr_array)
@@ -161,259 +184,41 @@ static int sdp_iocb_page_save(struct sdp
GFP_KERNEL);
if (!iocb->page_array)
goto err_page;
- /*
- * iocb->addr - buffer start address
- * iocb->size - buffer length
- * addr - page aligned
- * size - page multiple
- */
- addr = iocb->addr & PAGE_MASK;
- size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
- iocb->page_offset = iocb->addr - addr;
- /*
- * Find pages used within the buffer which will then be registered
- * for RDMA
- */
- spin_lock(&iocb->mm->page_table_lock);
+	down_read(&current->mm->mmap_sem);
- for (counter = 0;
- size > 0;
- counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
- pgd = pgd_offset_gate(iocb->mm, addr);
- if (!pgd || pgd_none(*pgd))
- break;
-
- pud = pud_offset(pgd, addr);
- if (!pud || pud_none(*pud))
- break;
-
- pmd = pmd_offset(pud, addr);
- if (!pmd || pmd_none(*pmd))
- break;
-
- ptep = pte_offset_map(pmd, addr);
- if (!ptep)
- break;
-
- pte = *ptep;
- pte_unmap(ptep);
-
- if (!pte_present(pte))
- break;
-
- pfn = pte_pfn(pte);
- if (!pfn_valid(pfn))
- break;
-
- page = pfn_to_page(pfn);
-
- iocb->page_array[counter] = page;
- iocb->addr_array[counter] = page_to_phys(page);
+	result = get_user_pages(current, current->mm, addr,
+				iocb->page_count, iocb->is_receive, 0,
+				iocb->page_array, NULL);
+
+	up_read(&current->mm->mmap_sem);
+
+	if (result != iocb->page_count) {
+		sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
+			    iocb->addr, iocb->size, result, iocb->page_count);
+		/* release any pages pinned before the failure */
+		for (i = 0; i < result; ++i)
+			page_cache_release(iocb->page_array[i]);
+		result = -EFAULT;
+		goto err_get;
}
- spin_unlock(&iocb->mm->page_table_lock);
-
- if (size > 0) {
- result = -EFAULT;
- goto err_find;
- }
-
- return 0;
-err_find:
-
- kfree(iocb->page_array);
- iocb->page_array = NULL;
-err_page:
-
- kfree(iocb->addr_array);
- iocb->addr_array = NULL;
-err_addr:
-
- return result;
-}
-
-/*
- * sdp_iocb_lock - lock the memory for an IOCB
- */
-int sdp_iocb_lock(struct sdpc_iocb *iocb)
-{
- struct vm_area_struct *vma;
- kernel_cap_t real_cap;
- unsigned long limit;
- int result = -ENOMEM;
- unsigned long addr;
- size_t size;
-
- /*
- * mark IOCB as locked. We do not take a reference on the mm, AIO
- * handles this for us.
- */
iocb->flags |= SDP_IOCB_F_LOCKED;
iocb->mm = current->mm;
- /*
- * save and raise capabilities
- */
- real_cap = cap_t(current->cap_effective);
- cap_raise(current->cap_effective, CAP_IPC_LOCK);
-
- size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
- addr = iocb->addr & PAGE_MASK;
-
- iocb->page_count = size >> PAGE_SHIFT;
+ iocb->tsk = current;
- limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- limit >>= PAGE_SHIFT;
- /*
- * lock the mm, if within the limit lock the address range.
- */
- down_write(&iocb->mm->mmap_sem);
- if (!((iocb->page_count + current->mm->locked_vm) > limit))
- result = (*mlock_ptr)(addr, size, 1);
- /*
- * process result
- */
- if (result) {
- sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
- iocb->addr, iocb->size, result,
- iocb->page_count, iocb->mm->locked_vm, limit);
- goto err_lock;
+	for (i = 0; i < iocb->page_count; ++i)
+		iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
- /*
- * look up the head of the vma queue, loop through the vmas, marking
- * them do not copy, reference counting, and saving them.
- */
- vma = find_vma(iocb->mm, addr);
- if (!vma)
- /*
- * sanity check.
- */
- sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
- iocb->addr, iocb->size);
-
- while (vma) {
- spin_lock(&iocb->mm->page_table_lock);
-
- if (!(VM_LOCKED & vma->vm_flags))
- sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
-
- if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
- sdp_dbg_err("VMA: private daya in use! <%08lx>",
- (unsigned long)vma->vm_private_data);
-
- vma->vm_flags |= VM_DONTCOPY;
- vma->vm_private_data++;
-
- spin_unlock(&iocb->mm->page_table_lock);
-
- sdp_dbg_data(NULL,
- "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
- iocb->addr, vma, vma->vm_start, vma->vm_end,
- vma->vm_flags, (long)vma->vm_private_data);
-
- if ((addr + size) > vma->vm_end)
- vma = vma->vm_next;
- else
- vma = NULL;
- }
-
- result = sdp_iocb_page_save(iocb);
- if (result) {
- sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>",
- result, iocb->addr, iocb->size);
- goto err_save;
- }
-
- up_write(&iocb->mm->mmap_sem);
- cap_t(current->cap_effective) = real_cap;
return 0;
-err_save:
-
- (void)do_iocb_unlock(iocb);
-err_lock:
- /*
- * unlock the mm and restore capabilities.
- */
- up_write(&iocb->mm->mmap_sem);
- cap_t(current->cap_effective) = real_cap;
-
- iocb->flags &= ~SDP_IOCB_F_LOCKED;
- iocb->mm = NULL;
+err_get:
+ kfree(iocb->page_array);
+err_page:
+ kfree(iocb->addr_array);
+err_addr:
return result;
}
/*
- * IOCB memory locking init functions
- */
-struct kallsym_iter {
- loff_t pos;
- struct module *owner;
- unsigned long value;
- unsigned int nameoff; /* If iterating in core kernel symbols */
- char type;
- char name[128];
-};
-
-/*
- * sdp_mem_lock_init - initialize the userspace memory locking
- */
-static int sdp_mem_lock_init(void)
-{
- struct file *kallsyms;
- struct seq_file *seq;
- struct kallsym_iter *iter;
- loff_t pos = 0;
- int ret = -EINVAL;
-
- sdp_dbg_init("Memory Locking initialization.");
-
- kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
- if (!kallsyms) {
- sdp_warn("Failed to open /proc/kallsyms");
- goto done;
- }
-
- seq = (struct seq_file *)kallsyms->private_data;
- if (!seq) {
- sdp_warn("Failed to fetch sequential file.");
- goto err_close;
- }
-
- for (iter = seq->op->start(seq, &pos);
- iter != NULL;
- iter = seq->op->next(seq, iter, &pos))
- if (!strcmp(iter->name, "do_mlock"))
- mlock_ptr = (do_mlock_ptr_t)iter->value;
-
- if (!mlock_ptr)
- sdp_warn("Failed to find lock pointer.");
- else
- ret = 0;
-
-err_close:
- filp_close(kallsyms, NULL);
-done:
- return ret;
-}
-
-/*
- * sdp_mem_lock_cleanup - cleanup the memory locking tables
- */
-static int sdp_mem_lock_cleanup(void)
-{
- sdp_dbg_init("Memory Locking cleanup.");
- /*
- * null out entries.
- */
- mlock_ptr = NULL;
-
- return 0;
-}
-
-/*
* IOCB memory registration functions
*/
@@ -831,28 +636,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
}
/*
- * primary initialization/cleanup functions
- */
-
-/*
* sdp_main_iocb_init - initialize the advertisment caches
*/
int sdp_main_iocb_init(void)
{
- int result;
-
sdp_dbg_init("IOCB cache initialization.");
- /*
- * initialize locking code.
- */
- result = sdp_mem_lock_init();
- if (result < 0) {
- sdp_warn("Error <%d> initializing memory locking.", result);
- return result;
- }
- /*
- * initialize the caches only once.
- */
+
if (sdp_iocb_cache) {
sdp_warn("IOCB caches already initialized.");
return -EINVAL;
@@ -862,15 +651,10 @@ int sdp_main_iocb_init(void)
sizeof(struct sdpc_iocb),
0, SLAB_HWCACHE_ALIGN, NULL,
NULL);
- if (!sdp_iocb_cache) {
- result = -ENOMEM;
- goto error_iocb_c;
- }
+ if (!sdp_iocb_cache)
+ return -ENOMEM;
return 0;
-error_iocb_c:
- (void)sdp_mem_lock_cleanup();
- return result;
}
/*
@@ -879,16 +663,6 @@ error_iocb_c:
void sdp_main_iocb_cleanup(void)
{
sdp_dbg_init("IOCB cache cleanup.");
- /*
- * cleanup the caches
- */
kmem_cache_destroy(sdp_iocb_cache);
- /*
- * null out entries.
- */
sdp_iocb_cache = NULL;
- /*
- * cleanup memory locking
- */
- (void)sdp_mem_lock_cleanup();
}
Index: sdp_iocb.h
===================================================================
--- sdp_iocb.h (revision 1972)
+++ sdp_iocb.h (working copy)
@@ -99,9 +99,11 @@ struct sdpc_iocb {
/*
* page list. data for locking/registering userspace
*/
- struct mm_struct *mm; /* user mm struct */
- unsigned long addr; /* user space address */
- size_t size; /* total size of the user buffer */
+	struct mm_struct *mm;     /* user mm struct */
+	struct task_struct *tsk;  /* user task, needed for get_user_pages */
+	unsigned long addr;       /* user space address */
+	size_t size;              /* total size of the user buffer */
+	int is_receive;           /* non-zero if hardware writes into the buffer */
struct page **page_array; /* list of page structure pointers. */
u64 *addr_array; /* list of physical page addresses. */
--
MST - Michael S. Tsirkin