[openib-general] [PATCH] huge pages support

Robert Rex robert.rex at s2001.tu-chemnitz.de
Fri Aug 18 06:23:41 PDT 2006


Hello,

I've also worked on the same topic. Below is what I have so far: it makes
ib_umem_get() notice when the user buffer lives in a hugetlb mapping and,
in that case, pin and map the region in HPAGE_SIZE units instead of
PAGE_SIZE units. I have successfully tested it on mthca and ehca, and I'd
appreciate any comments and suggestions.
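
To exercise the huge page path from user space, a test along these lines
should do (a minimal sketch, assuming libibverbs, a hugetlbfs mount at
/mnt/huge, enough huge pages reserved via /proc/sys/vm/nr_hugepages, and a
sufficient RLIMIT_MEMLOCK; the file name and buffer size are only examples):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <infiniband/verbs.h>

#define HUGE_FILE "/mnt/huge/umem_test"	/* example hugetlbfs file */
#define BUF_SIZE  (4 * 1024 * 1024)	/* multiple of the huge page size */

int main(void)
{
	struct ibv_device **dev_list;
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_mr *mr;
	void *buf;
	int fd;

	dev_list = ibv_get_device_list(NULL);
	if (!dev_list || !dev_list[0]) {
		fprintf(stderr, "no IB devices found\n");
		return 1;
	}
	ctx = ibv_open_device(dev_list[0]);
	pd  = ctx ? ibv_alloc_pd(ctx) : NULL;
	if (!pd) {
		fprintf(stderr, "failed to open device/allocate PD\n");
		return 1;
	}

	/* back the buffer with huge pages via a hugetlbfs mapping */
	fd = open(HUGE_FILE, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open " HUGE_FILE);
		return 1;
	}
	buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* this registration should now take the HPAGE_SIZE path */
	mr = ibv_reg_mr(pd, buf, BUF_SIZE, IBV_ACCESS_LOCAL_WRITE);
	if (!mr) {
		fprintf(stderr, "ibv_reg_mr failed\n");
		return 1;
	}

	ibv_dereg_mr(mr);
	munmap(buf, BUF_SIZE);
	close(fd);
	unlink(HUGE_FILE);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(dev_list);
	return 0;
}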

diff -Nurp a/drivers/infiniband/core/uverbs_mem.c b/drivers/infiniband/core/uverbs_mem.c
--- a/drivers/infiniband/core/uverbs_mem.c	2006-08-15 05:42:06.000000000 -0700
+++ b/drivers/infiniband/core/uverbs_mem.c	2006-08-18 04:22:22.000000000 -0700
@@ -36,6 +36,7 @@
 
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
+#include <linux/hugetlb.h>
 
 #include "uverbs.h"
 
@@ -73,6 +74,9 @@ int ib_umem_get(struct ib_device *dev, s
 	unsigned long lock_limit;
 	unsigned long cur_base;
 	unsigned long npages;
+	unsigned long region_page_mask, region_page_shift, region_page_size;
+	struct vm_area_struct *vma;
+	int use_hugepages;
 	int ret = 0;
 	int off;
 	int i;
@@ -84,19 +88,39 @@ int ib_umem_get(struct ib_device *dev, s
 	if (!page_list)
 		return -ENOMEM;
 
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, (unsigned long) addr);
+	if (vma && is_vm_hugetlb_page(vma)) {
+		use_hugepages           = 1;
+		region_page_mask        = HPAGE_MASK;
+		region_page_size        = HPAGE_SIZE;
+		region_page_shift       = HPAGE_SHIFT;
+	} else {
+		use_hugepages           = 0;
+		region_page_mask        = PAGE_MASK;
+		region_page_size        = PAGE_SIZE;
+		region_page_shift       = PAGE_SHIFT;
+	}
+	up_read(&current->mm->mmap_sem);
+
 	mem->user_base = (unsigned long) addr;
 	mem->length    = size;
-	mem->offset    = (unsigned long) addr & ~PAGE_MASK;
-	mem->page_size = PAGE_SIZE;
+	mem->offset    = (unsigned long) addr & ~region_page_mask;
+	mem->page_size = region_page_size;
 	mem->writable  = write;
 
 	INIT_LIST_HEAD(&mem->chunk_list);
 
-	npages = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT;
+	npages = ALIGN(size + mem->offset, region_page_size) >> region_page_shift;
 
 	down_write(&current->mm->mmap_sem);
 
-	locked     = npages + current->mm->locked_vm;
+	if (use_hugepages)
+		locked = npages * (HPAGE_SIZE / PAGE_SIZE) +
+				current->mm->locked_vm;
+	else
+		locked = npages + current->mm->locked_vm;
+
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
 
 	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
@@ -104,19 +128,39 @@ int ib_umem_get(struct ib_device *dev, s
 		goto out;
 	}
 
-	cur_base = (unsigned long) addr & PAGE_MASK;
+	cur_base = (unsigned long) addr & region_page_mask;
 
 	while (npages) {
-		ret = get_user_pages(current, current->mm, cur_base,
-				     min_t(int, npages,
-					   PAGE_SIZE / sizeof (struct page *)),
-				     1, !write, page_list, NULL);
+		if (!use_hugepages) {
+			ret = get_user_pages(current, current->mm, cur_base,
+					min_t(int, npages, PAGE_SIZE
+						/ sizeof (struct page *)),
+					1, !write, page_list, NULL);
 
-		if (ret < 0)
-			goto out;
+			if (ret < 0)
+				goto out;
+
+			cur_base += ret * PAGE_SIZE;
+			npages   -= ret;
+		} else {
+			while (npages && (ret < PAGE_SIZE /
+						sizeof (struct page *))) {
+				if (get_user_pages(current, current->mm,
+						cur_base, 1, 1, !write,
+						&page_list[ret], NULL) < 1) {
+					/* unpin pages already pinned */
+					while (ret)
+						put_page(page_list[--ret]);
+					ret = -EFAULT;
+					goto out;
+				}
+
+				ret++;
+				cur_base += HPAGE_SIZE;
+				npages--;
+			}
 
-		cur_base += ret * PAGE_SIZE;
-		npages   -= ret;
+		}
 
 		off = 0;
 
@@ -133,7 +171,7 @@ int ib_umem_get(struct ib_device *dev, s
 			for (i = 0; i < chunk->nents; ++i) {
 				chunk->page_list[i].page   = page_list[i + off];
 				chunk->page_list[i].offset = 0;
-				chunk->page_list[i].length = PAGE_SIZE;
+				chunk->page_list[i].length = region_page_size;
 			}
 
 			chunk->nmap = dma_map_sg(dev->dma_device,
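
One point worth spelling out for reviewers: npages is counted in
region-sized units, while locked_vm and RLIMIT_MEMLOCK are kept in
PAGE_SIZE units, hence the npages * (HPAGE_SIZE / PAGE_SIZE) conversion
before the limit check. As a concrete example, with 4 KB base pages and
2 MB huge pages (x86-64), registering a 16 MB hugetlbfs buffer yields
npages = 8 and one scatter/gather entry per huge page instead of 4096,
while the memlock check still charges 8 * 512 = 4096 base pages.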