[openib-general] [PATCH] (repost) sdp: replace mlock with get_user_pages
Michael S. Tsirkin
mst at mellanox.co.il
Tue May 10 07:32:32 PDT 2005
Quoting r. Libor Michalek <libor at topspin.com>:
> Subject: Re: [PATCH] sdp: replace mlock with get_user_pages
>
> On Thu, May 05, 2005 at 02:01:58PM +0300, Michael S. Tsirkin wrote:
> > Hello, Libor!
> > The following patch replaces the mlock hack with a call
> > to get_user_pages. Since the application could have forked
> > while an iocb is outstanding, when an iocb is done
> > I call get_user_pages a second time and copy the data if
> > the physical address has changed.
> >
> > Thus, changing ulimit is no longer required to get aio
> > working; processes are also allowed to fork and to call
> > mlock/munlock on the buffer.
> >
> > Tested with the ttcp.aio benchmark; works fine for me.
>
> In the latest kernel, what happens to a page that's held
> with a get_user_pages reference on fork? That's the only
> case left. Since there is an iocb flag parameter, it would
> be preferred to use a bit there instead of is_receive. Also,
> please leave spaces around '='. Finally, it would be nice
> if we could get rid of having to schedule a task to unlock
> the iocb in complete; I've noticed this having a negative
> effect on system CPU utilization on older kernels.
>
> -Libor
I don't see what would prevent the page
from being copied on a COW event. Do you? Certainly the patch in 2.6.7
that fixed the swap-out problem does not affect COW.
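To make that concrete, the completion-side logic amounts to the
following (a condensed sketch of the sdp_iocb_unlock() path from the
patch below; check_cow_and_copy() is an illustrative helper name, not
part of the patch, and the caller is assumed to hold
iocb->mm->mmap_sem for read):

static void check_cow_and_copy(struct sdpc_iocb *iocb, int i,
			       unsigned long uaddr)
{
	struct page *page;
	int result;

	/* re-pin the page currently mapped at uaddr */
	result = get_user_pages(iocb->tsk, iocb->mm, uaddr & PAGE_MASK,
				1, 1, 0, &page, NULL);
	if (result == 1) {
		/* if a COW event (e.g. after fork) replaced the page,
		 * copy the received data over to the new page */
		if (page != iocb->page_array[i])
			sdp_copy_one_page(iocb->page_array[i], page,
					  iocb->addr, iocb->size, uaddr);
		put_page(page);
	}
	put_page(iocb->page_array[i]);	/* drop the pin taken at submit */
}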
With regard to avoiding task scheduling: since the current code already
needs this, we are not breaking anything.
I put in comments so that we remember to do this research at some
point.
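(For context: the unlock must run in process context, since it takes
mmap_sem and may sleep, neither of which is allowed from the
interrupt-context completion path. A minimal sketch of the deferral
pattern, using the three-argument INIT_WORK() of 2.6-era kernels; the
work member and sdp_iocb_unlock_work() are assumed names, not taken
from the patch:)

#include <linux/workqueue.h>

/* completion runs in interrupt context; sdp_iocb_unlock() takes
 * mmap_sem and may sleep, so defer the unlock to keventd */
static void sdp_iocb_unlock_work(void *data)
{
	struct sdpc_iocb *iocb = data;

	sdp_iocb_unlock(iocb);	/* process context: sleeping is safe */
}

static void sdp_iocb_defer_unlock(struct sdpc_iocb *iocb)
{
	INIT_WORK(&iocb->work, sdp_iocb_unlock_work, iocb);
	schedule_work(&iocb->work);
}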
In the patch below, the style issues are addressed: the whitespace is
fixed and is_receive is changed to a flag bit.
OK to check in?
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
Index: ulp/sdp/sdp_send.c
===================================================================
--- ulp/sdp/sdp_send.c (revision 2292)
+++ ulp/sdp/sdp_send.c (working copy)
@@ -2197,7 +2197,7 @@ skip: /* entry point for IOCB based tran
iocb->req = req;
iocb->key = req->ki_key;
iocb->addr = (unsigned long)msg->msg_iov->iov_base - copied;
-
+
req->ki_cancel = sdp_inet_write_cancel;
result = sdp_iocb_lock(iocb);
Index: ulp/sdp/sdp_recv.c
===================================================================
--- ulp/sdp/sdp_recv.c (revision 2292)
+++ ulp/sdp/sdp_recv.c (working copy)
@@ -1449,6 +1449,8 @@ int sdp_inet_recv(struct kiocb *req, st
iocb->addr = ((unsigned long)msg->msg_iov->iov_base -
copied);
+ iocb->flags |= SDP_IOCB_F_RECV;
+
req->ki_cancel = sdp_inet_read_cancel;
result = sdp_iocb_lock(iocb);
Index: ulp/sdp/sdp_iocb.c
===================================================================
--- ulp/sdp/sdp_iocb.c (revision 2292)
+++ ulp/sdp/sdp_iocb.c (working copy)
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -31,389 +32,196 @@
*
* $Id$
*/
-
+#include <linux/pagemap.h>
#include "sdp_main.h"
static kmem_cache_t *sdp_iocb_cache = NULL;
-/*
- * memory locking functions
- */
-#include <linux/utsname.h>
-
-typedef int (*do_mlock_ptr_t)(unsigned long, size_t, int);
-static do_mlock_ptr_t mlock_ptr = NULL;
-
-/*
- * do_iocb_unlock - unlock the memory for an IOCB
- */
-static int do_iocb_unlock(struct sdpc_iocb *iocb)
-{
- struct vm_area_struct *vma;
-
- vma = find_vma(iocb->mm, (iocb->addr & PAGE_MASK));
- if (!vma)
- sdp_warn("No VMA for IOCB <%lx:%Zu> unlock",
- iocb->addr, iocb->size);
-
- while (vma) {
- sdp_dbg_data(NULL,
- "unmark <%lx> <%p> <%08lx:%08lx> <%08lx> <%ld>",
- iocb->addr, vma, vma->vm_start, vma->vm_end,
- vma->vm_flags, (long)vma->vm_private_data);
-
- spin_lock(&iocb->mm->page_table_lock);
- /*
- * if there are no more references to the vma
- */
- vma->vm_private_data--;
-
- if (!vma->vm_private_data) {
- /*
- * modify VM flags.
- */
- vma->vm_flags &= ~(VM_DONTCOPY|VM_LOCKED);
- /*
- * adjust locked page count
- */
- vma->vm_mm->locked_vm -= ((vma->vm_end -
- vma->vm_start) >>
- PAGE_SHIFT);
- }
-
- spin_unlock(&iocb->mm->page_table_lock);
- /*
- * continue if the buffer continues onto the next vma
- */
- if ((iocb->addr + iocb->size) > vma->vm_end)
- vma = vma->vm_next;
- else
- vma = NULL;
- }
-
- return 0;
+static void sdp_copy_one_page(struct page *from, struct page* to,
+ unsigned long iocb_addr, size_t iocb_size,
+ unsigned long uaddr)
+{
+ size_t size_left = iocb_addr + iocb_size - uaddr;
+ size_t size = min(size_left,PAGE_SIZE);
+ unsigned long offset = uaddr % PAGE_SIZE;
+ unsigned long flags;
+
+ void* fptr;
+ void* tptr;
+
+ local_irq_save(flags);
+ fptr = kmap_atomic(from, KM_IRQ0);
+ tptr = kmap_atomic(to, KM_IRQ1);
+
+ memcpy(tptr + offset, fptr + offset, size);
+
+ kunmap_atomic(tptr, KM_IRQ1);
+ kunmap_atomic(fptr, KM_IRQ0);
+ local_irq_restore(flags);
+ set_page_dirty_lock(to);
}
/*
* sdp_iocb_unlock - unlock the memory for an IOCB
+ * Copy if pages moved since.
+ * TODO: is this needed?
*/
int sdp_iocb_unlock(struct sdpc_iocb *iocb)
{
int result;
+ struct page ** pages = NULL;
+ unsigned long uaddr;
+ int i;
+
- /*
- * check if IOCB is locked.
- */
if (!(iocb->flags & SDP_IOCB_F_LOCKED))
return 0;
- /*
- * spin lock since this could be from interrupt context.
- */
- down_write(&iocb->mm->mmap_sem);
-
- result = do_iocb_unlock(iocb);
- up_write(&iocb->mm->mmap_sem);
+ /* For read, unlock and we are done */
+ if (!(iocb->flags & SDP_IOCB_F_RECV)) {
+ for (i = 0;i < iocb->page_count; ++i)
+ put_page(iocb->page_array[i]);
+ goto done;
+ }
- kfree(iocb->page_array);
- kfree(iocb->addr_array);
+ /* For write, we must check the virtual pages did not get remapped */
- iocb->page_array = NULL;
- iocb->addr_array = NULL;
- iocb->mm = NULL;
- /*
- * mark IOCB unlocked.
- */
- iocb->flags &= ~SDP_IOCB_F_LOCKED;
+ /* As an optimisation (to avoid scanning the vma tree each time),
+ * try to get all pages in one go. */
+ /* TODO: use cache for allocations? Allocate by chunks? */
- return result;
-}
+ pages = kmalloc((sizeof(struct page *) * iocb->page_count), GFP_KERNEL);
-/*
- * sdp_iocb_page_save - save page information for an IOCB
- */
-static int sdp_iocb_page_save(struct sdpc_iocb *iocb)
-{
- unsigned int counter;
- unsigned long addr;
- size_t size;
- int result = -ENOMEM;
- struct page *page;
- unsigned long pfn;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
- pte_t pte;
+ down_read(&iocb->mm->mmap_sem);
- if (iocb->page_count <= 0 || iocb->size <= 0 || !iocb->addr)
- return -EINVAL;
- /*
- * create array to hold page value which are later needed to register
- * the buffer with the HCA
- */
- iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
- GFP_KERNEL);
- if (!iocb->addr_array)
- goto err_addr;
+ if (pages) {
+ result = get_user_pages(iocb->tsk, iocb->mm, iocb->addr,
+ iocb->page_count, 1, 0, pages, NULL);
- iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count),
- GFP_KERNEL);
- if (!iocb->page_array)
- goto err_page;
- /*
- * iocb->addr - buffer start address
- * iocb->size - buffer length
- * addr - page aligned
- * size - page multiple
- */
- addr = iocb->addr & PAGE_MASK;
- size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
+ if (result != iocb->page_count) {
+ kfree(pages);
+ pages = NULL;
+ }
+ }
- iocb->page_offset = iocb->addr - addr;
- /*
- * Find pages used within the buffer which will then be registered
- * for RDMA
- */
- spin_lock(&iocb->mm->page_table_lock);
+ for (i = 0, uaddr = iocb->addr; i < iocb->page_count;
+ ++i, uaddr = (uaddr & PAGE_MASK) + PAGE_SIZE)
+ {
+ struct page* page;
+ set_page_dirty_lock(iocb->page_array[i]);
+
+ if (pages)
+ page = pages[i];
+ else {
+ result = get_user_pages(iocb->tsk, iocb->mm,
+ uaddr & PAGE_MASK,
+ 1 , 1, 0, &page, NULL);
+ if (result != 1) {
+ page = NULL;
+ }
+ }
- for (counter = 0;
- size > 0;
- counter++, addr += PAGE_SIZE, size -= PAGE_SIZE) {
- pgd = pgd_offset_gate(iocb->mm, addr);
- if (!pgd || pgd_none(*pgd))
- break;
-
- pud = pud_offset(pgd, addr);
- if (!pud || pud_none(*pud))
- break;
-
- pmd = pmd_offset(pud, addr);
- if (!pmd || pmd_none(*pmd))
- break;
-
- ptep = pte_offset_map(pmd, addr);
- if (!ptep)
- break;
-
- pte = *ptep;
- pte_unmap(ptep);
-
- if (!pte_present(pte))
- break;
-
- pfn = pte_pfn(pte);
- if (!pfn_valid(pfn))
- break;
-
- page = pfn_to_page(pfn);
+ if (page && iocb->page_array[i] != page)
+ sdp_copy_one_page(iocb->page_array[i], page,
+ iocb->addr, iocb->size, uaddr);
- iocb->page_array[counter] = page;
- iocb->addr_array[counter] = page_to_phys(page);
+ if (page)
+ put_page(page);
+ put_page(iocb->page_array[i]);
}
- spin_unlock(&iocb->mm->page_table_lock);
-
- if (size > 0) {
- result = -EFAULT;
- goto err_find;
- }
+ up_read(&iocb->mm->mmap_sem);
- return 0;
-err_find:
-
- kfree(iocb->page_array);
- iocb->page_array = NULL;
-err_page:
+ if (pages)
+ kfree(pages);
+done:
+ kfree(iocb->page_array);
kfree(iocb->addr_array);
+
+ iocb->page_array = NULL;
iocb->addr_array = NULL;
-err_addr:
+ iocb->mm = NULL;
+ iocb->tsk = NULL;
- return result;
+ iocb->flags &= ~SDP_IOCB_F_LOCKED;
+
+ return 0;
}
/*
* sdp_iocb_lock - lock the memory for an IOCB
+ * We do not take a reference on the mm, AIO handles this for us.
*/
int sdp_iocb_lock(struct sdpc_iocb *iocb)
{
- struct vm_area_struct *vma;
- kernel_cap_t real_cap;
- unsigned long limit;
int result = -ENOMEM;
unsigned long addr;
- size_t size;
-
- /*
- * mark IOCB as locked. We do not take a reference on the mm, AIO
- * handles this for us.
- */
- iocb->flags |= SDP_IOCB_F_LOCKED;
- iocb->mm = current->mm;
+ size_t size;
+ int i;
/*
- * save and raise capabilities
+ * iocb->addr - buffer start address
+ * iocb->size - buffer length
+ * addr - page aligned
+ * size - page multiple
*/
- real_cap = cap_t(current->cap_effective);
- cap_raise(current->cap_effective, CAP_IPC_LOCK);
-
- size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
addr = iocb->addr & PAGE_MASK;
+ size = PAGE_ALIGN(iocb->size + (iocb->addr & ~PAGE_MASK));
+ iocb->page_offset = iocb->addr - addr;
+
iocb->page_count = size >> PAGE_SHIFT;
-
- limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- limit >>= PAGE_SHIFT;
- /*
- * lock the mm, if within the limit lock the address range.
- */
- down_write(&iocb->mm->mmap_sem);
-
- if (!((iocb->page_count + current->mm->locked_vm) > limit))
- result = (*mlock_ptr)(addr, size, 1);
/*
- * process result
- */
- if (result) {
- sdp_dbg_err("VMA lock <%lx:%Zu> error <%d> <%d:%lu:%lu>",
- iocb->addr, iocb->size, result,
- iocb->page_count, iocb->mm->locked_vm, limit);
- goto err_lock;
- }
- /*
- * look up the head of the vma queue, loop through the vmas, marking
- * them do not copy, reference counting, and saving them.
+ * create array to hold page value which are later needed to register
+ * the buffer with the HCA
*/
- vma = find_vma(iocb->mm, addr);
- if (!vma)
- /*
- * sanity check.
- */
- sdp_warn("No VMA for IOCB! <%lx:%Zu> lock",
- iocb->addr, iocb->size);
- while (vma) {
- spin_lock(&iocb->mm->page_table_lock);
+ /* TODO: use cache for allocations? Allocate by chunks? */
+ iocb->addr_array = kmalloc((sizeof(u64) * iocb->page_count),
+ GFP_KERNEL);
+ if (!iocb->addr_array)
+ goto err_addr;
- if (!(VM_LOCKED & vma->vm_flags))
- sdp_warn("Unlocked vma! <%08lx>", vma->vm_flags);
+ iocb->page_array = kmalloc((sizeof(struct page *) * iocb->page_count),
+ GFP_KERNEL);
+ if (!iocb->page_array)
+ goto err_page;
- if (PAGE_SIZE < (unsigned long)vma->vm_private_data)
- sdp_dbg_err("VMA: private daya in use! <%08lx>",
- (unsigned long)vma->vm_private_data);
-
- vma->vm_flags |= VM_DONTCOPY;
- vma->vm_private_data++;
+ down_read(&current->mm->mmap_sem);
- spin_unlock(&iocb->mm->page_table_lock);
+ result = get_user_pages(current, current->mm,
+ iocb->addr, iocb->page_count,
+ !!(iocb->flags & SDP_IOCB_F_RECV), 0,
+ iocb->page_array, NULL);
+
+ up_read(&current->mm->mmap_sem);
+
+ if (result != iocb->page_count) {
+ sdp_dbg_err("unable to lock <%lx:%Zu> error <%d> <%d>",
+ iocb->addr, iocb->size, result, iocb->page_count);
+ goto err_get;
+ }
- sdp_dbg_data(NULL,
- "mark <%lx> <0x%p> <%08lx:%08lx> <%08lx> <%ld>",
- iocb->addr, vma, vma->vm_start, vma->vm_end,
- vma->vm_flags, (long)vma->vm_private_data);
+ iocb->flags |= SDP_IOCB_F_LOCKED;
+ iocb->mm = current->mm;
+ iocb->tsk = current;
- if ((addr + size) > vma->vm_end)
- vma = vma->vm_next;
- else
- vma = NULL;
- }
- result = sdp_iocb_page_save(iocb);
- if (result) {
- sdp_dbg_err("Error <%d> saving pages for IOCB <%lx:%Zu>",
- result, iocb->addr, iocb->size);
- goto err_save;
+ for (i = 0; i< iocb->page_count; ++i) {
+ iocb->addr_array[i] = page_to_phys(iocb->page_array[i]);
}
- up_write(&iocb->mm->mmap_sem);
- cap_t(current->cap_effective) = real_cap;
-
return 0;
-err_save:
-
- (void)do_iocb_unlock(iocb);
-err_lock:
- /*
- * unlock the mm and restore capabilities.
- */
- up_write(&iocb->mm->mmap_sem);
- cap_t(current->cap_effective) = real_cap;
-
- iocb->flags &= ~SDP_IOCB_F_LOCKED;
- iocb->mm = NULL;
+err_get:
+ kfree(iocb->page_array);
+err_page:
+ kfree(iocb->addr_array);
+err_addr:
return result;
}
/*
- * IOCB memory locking init functions
- */
-struct kallsym_iter {
- loff_t pos;
- struct module *owner;
- unsigned long value;
- unsigned int nameoff; /* If iterating in core kernel symbols */
- char type;
- char name[128];
-};
-
-/*
- * sdp_mem_lock_init - initialize the userspace memory locking
- */
-static int sdp_mem_lock_init(void)
-{
- struct file *kallsyms;
- struct seq_file *seq;
- struct kallsym_iter *iter;
- loff_t pos = 0;
- int ret = -EINVAL;
-
- sdp_dbg_init("Memory Locking initialization.");
-
- kallsyms = filp_open("/proc/kallsyms", O_RDONLY, 0);
- if (!kallsyms) {
- sdp_warn("Failed to open /proc/kallsyms");
- goto done;
- }
-
- seq = (struct seq_file *)kallsyms->private_data;
- if (!seq) {
- sdp_warn("Failed to fetch sequential file.");
- goto err_close;
- }
-
- for (iter = seq->op->start(seq, &pos);
- iter != NULL;
- iter = seq->op->next(seq, iter, &pos))
- if (!strcmp(iter->name, "do_mlock"))
- mlock_ptr = (do_mlock_ptr_t)iter->value;
-
- if (!mlock_ptr)
- sdp_warn("Failed to find lock pointer.");
- else
- ret = 0;
-
-err_close:
- filp_close(kallsyms, NULL);
-done:
- return ret;
-}
-
-/*
- * sdp_mem_lock_cleanup - cleanup the memory locking tables
- */
-static int sdp_mem_lock_cleanup(void)
-{
- sdp_dbg_init("Memory Locking cleanup.");
- /*
- * null out entries.
- */
- mlock_ptr = NULL;
-
- return 0;
-}
-
-/*
* IOCB memory registration functions
*/
@@ -831,28 +639,12 @@ void sdp_iocb_q_clear(struct sdpc_iocb_q
}
/*
- * primary initialization/cleanup functions
- */
-
-/*
* sdp_main_iocb_init - initialize the advertisment caches
*/
int sdp_main_iocb_init(void)
{
- int result;
-
sdp_dbg_init("IOCB cache initialization.");
- /*
- * initialize locking code.
- */
- result = sdp_mem_lock_init();
- if (result < 0) {
- sdp_warn("Error <%d> initializing memory locking.", result);
- return result;
- }
- /*
- * initialize the caches only once.
- */
+
if (sdp_iocb_cache) {
sdp_warn("IOCB caches already initialized.");
return -EINVAL;
@@ -862,15 +654,10 @@ int sdp_main_iocb_init(void)
sizeof(struct sdpc_iocb),
0, SLAB_HWCACHE_ALIGN, NULL,
NULL);
- if (!sdp_iocb_cache) {
- result = -ENOMEM;
- goto error_iocb_c;
- }
+ if (!sdp_iocb_cache)
+ return -ENOMEM;
return 0;
-error_iocb_c:
- (void)sdp_mem_lock_cleanup();
- return result;
}
/*
@@ -879,16 +666,6 @@ error_iocb_c:
void sdp_main_iocb_cleanup(void)
{
sdp_dbg_init("IOCB cache cleanup.");
- /*
- * cleanup the caches
- */
kmem_cache_destroy(sdp_iocb_cache);
- /*
- * null out entries.
- */
sdp_iocb_cache = NULL;
- /*
- * cleanup memory locking
- */
- (void)sdp_mem_lock_cleanup();
}
Index: ulp/sdp/sdp_iocb.h
===================================================================
--- ulp/sdp/sdp_iocb.h (revision 2292)
+++ ulp/sdp/sdp_iocb.h (working copy)
@@ -52,7 +52,8 @@
#define SDP_IOCB_F_RDMA_R 0x00000010 /* IOCB is in RDMA read processing */
#define SDP_IOCB_F_RDMA_W 0x00000020 /* IOCB is in RDMA write processing */
#define SDP_IOCB_F_LOCKED 0x00000040 /* IOCB is locked in memory */
-#define SDP_IOCB_F_REG 0x00000080 /* IOCB is memory is registered */
+#define SDP_IOCB_F_REG 0x00000080 /* IOCB memory is registered */
+#define SDP_IOCB_F_RECV 0x00000100 /* IOCB is for a receive request */
#define SDP_IOCB_F_ALL 0xFFFFFFFF /* IOCB all mask */
/*
* zcopy constants.
@@ -99,9 +100,10 @@ struct sdpc_iocb {
/*
* page list. data for locking/registering userspace
*/
- struct mm_struct *mm; /* user mm struct */
- unsigned long addr; /* user space address */
- size_t size; /* total size of the user buffer */
+ struct mm_struct *mm; /* user mm struct */
+ struct task_struct *tsk;
+ unsigned long addr; /* user space address */
+ size_t size; /* total size of the user buffer */
struct page **page_array; /* list of page structure pointers. */
u64 *addr_array; /* list of physical page addresses. */
--
MST - Michael S. Tsirkin