[openib-general] kernel VM monitor for memory registration caching

Pete Wyckoff pw at osc.edu
Mon Aug 1 10:55:16 PDT 2005


glebn at voltaire.com wrote on Sun, 31 Jul 2005 13:31 +0300:
>  First of all, you have one user_delta per mm that user can poll from 
> userspace. Is it possible to make user_delta to be part of dreg_region
> instead of dreg_context and module will set it whenever
> registration becomes invalid. Field 'invalid' will be added to buf_info
> structure and pointer to it will be passed to kernel at registration
> time.
>  This way the userpace can look up cache and check if registration is
> still valid. No need to rescan cache from userspace, we already scanned
> it once from kernel after all. With your current approach userspace will
> need to search for mr_handle in the cache and invalidate the entry that
> holds it.

I still think that idea is possible.  But perhaps someone who wants
to integrate this with a cache can do so sometime.  There is no real
cache structure in this userspace test code.

>  You change vma_ops in vma to catch open/close events. What about
> nopage() method in vma_ops? We have to forward it to original vma_ops?
> 
> Something like included patch (not even compiled).

Right, good catch.  I did something a bit different to avoid kmalloc
in the common case of empty origvma->vm_ops (i.e. normal memory).
But for the sake of correctness the fix must be there.

Note that these vm_ops structures don't really stack properly.  If
there happened to be some other kernel component that decided to
take ownership of the vm_ops structure, dreg would fail to notice
that it was the "owner" of the vma already, and it might free the
ops incorrectly.  Ideally there would be a chain of these things
that would get called from higher up that would not interfere with
each other.  As it is, dreg assumes it is the ultimate owner and
that other interfering vma ops have already done their business.

This stacked usage would be the rare case.  If you're communicating
using buffers from, say, your video or sound card memory, or a
hugetlb space, or SYSV shm, this will arise.  But in the case of shm
and hugetlb and DRM video, we're okay, as they set the ops in mmap()
and don't change it, ever.  IB memory registrations cannot happen
until the memory is mapped, so dreg will always be last.  Arbitrary
uses might cause failure someday, however.

		-- Pete


diff -u -p -r1.4 dreg.c
--- dreg.c	29 Jul 2005 17:17:52 -0000	1.4
+++ dreg.c	1 Aug 2005 17:41:00 -0000
@@ -96,6 +96,15 @@ static void dreg_vm_close(struct vm_area
 static void mem_deregister(struct dreg_context *dc, struct dreg_region *reg);
 
 /*
+ * Interface with VM.  It calls us back when one of these events happen on
+ * a VMA we care about.
+ */
+static struct vm_operations_struct dreg_vm_ops = {
+    .open = dreg_vm_open,
+    .close = dreg_vm_close,
+};
+
+/*
  * Helpful functions.
  */
 static struct dreg_context *find_context_by_mm(const struct mm_struct *mm)
@@ -161,8 +170,11 @@ static void destroy_region(struct dreg_c
     struct vm_area_struct *vma = reg->vma;
 
     pr_debug("%s: reg %p vma %p addr %lx\n", __func__, reg, vma, reg->addr);
-    if (vma)
+    if (vma) {
+	if (vma->vm_ops != &dreg_vm_ops)
+	    kfree(vma->vm_ops);
 	vma->vm_ops = reg->orig_ops;
+    }
     if (reg->addr)
 	mem_deregister(dc, reg);
     list_del(&reg->subordinate_list);
@@ -172,15 +184,6 @@ static void destroy_region(struct dreg_c
 }
 
 /*
- * Interface with VM.  It calls us back when one of these events happen on
- * a VMA we care about.
- */
-static struct vm_operations_struct dreg_vm_ops = {
-    .open = dreg_vm_open,
-    .close = dreg_vm_close,
-};
-
-/*
  * Triggered from VM activity.  Pass through to underlying vm_ops
  * if it exists after we finish.
  *
@@ -305,6 +308,10 @@ static void dreg_vm_open(struct vm_area_
      * forget about it and do not build a new region for it.
      */
     if (list_empty(&temp_new_subordinate_list)) {
+	/*
+	 * Do not free the newvma ops, it is just a copy, free handled when
+	 * original vma->vm_ops is destroyed.
+	 */
 	newvma->vm_ops = orig_ops;
     } else {
 	reg = kmem_cache_alloc(dreg_region_cache, GFP_KERNEL);
@@ -318,6 +325,14 @@ static void dreg_vm_open(struct vm_area_
 	}
 
 	reg->orig_ops = orig_ops;
+	if (orig_ops) {
+	    newvma->vm_ops = kmalloc(sizeof(*newvma->vm_ops), GFP_KERNEL);
+	    *newvma->vm_ops = *orig_ops;
+	    newvma->vm_ops->open = dreg_vm_open;
+	    newvma->vm_ops->close = dreg_vm_close;
+	} else {
+	    newvma->vm_ops = &dreg_vm_ops;
+	}
 	/* non subordinate */
 	reg->vma = newvma;
 	list_add(&reg->subordinate_list, &temp_new_subordinate_list);
@@ -526,7 +541,15 @@ static int dreg_write_monitor(struct dre
 	/* non subordinate */
 	reg->vma = vma;
 	INIT_LIST_HEAD(&reg->subordinate_list);
-	vma->vm_ops = &dreg_vm_ops;  /* own this vma */
+	if (reg->orig_ops) {
+	    /* copy it in case it uses more than open/close */
+	    vma->vm_ops = kmalloc(sizeof(*vma->vm_ops), GFP_KERNEL);
+	    *vma->vm_ops = *reg->orig_ops;
+	    vma->vm_ops->open = dreg_vm_open;
+	    vma->vm_ops->close = dreg_vm_close;
+	} else {
+	    vma->vm_ops = &dreg_vm_ops;  /* own this vma */
+	}
 	reg->orig_vm_start = vma->vm_start;
 	reg->orig_vm_end = vma->vm_end;
     }
@@ -564,6 +587,8 @@ static int dreg_write_unmonitor(struct d
 		if (list_empty(&reg->subordinate_list)) {
 		    pr_debug("%s: mr %x empty subord, releasing vma\n",
 		      __func__, mrh);
+		    if (reg->vma->vm_ops != &dreg_vm_ops)
+			kfree(reg->vma->vm_ops);
 		    reg->vma->vm_ops = reg->orig_ops;
 		} else {
 		    struct dreg_region *treg;



More information about the general mailing list