[ewg] [PATCH] IB/libibverbs: Add huge page support to madvise_range

Moni Shoua monis at Voltaire.COM
Wed Feb 2 07:20:53 PST 2011


A different patch for the same purpose was already sent to linux-rdma but has
not been accepted yet. This version of the patch, however, was tested heavily
by QA at Voltaire and in the field. When a new libibverbs release that fixes
this issue comes out we can throw this one away, but until then, let's use it.

ibv_reg_mr() fails to register a memory region that is allocated on huge pages
rather than on pages of the default size. This happens because
ibv_madvise_range() aligns the memory region to the default system page size
before calling madvise(), which then fails with EINVAL. madvise() fails because
it expects the start and end of the memory range to be huge page aligned.
The patch handles the issue as follows:
1. ibv_fork_init() gets the kernel's default huge page size in addition
   to the default page size.
2. ibv_madvise_range() first tries aligning the user's memory range to the
   default page size; if madvise() fails with EINVAL, it aligns the user's
   memory range to the huge page size and tries madvise() again.

Signed-off-by: Alex Vaynman <alexv at voltaire.com>
Reviewed-by:   Moni Shoua   <monis at voltaire.com>
---
 src/memory.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 68 insertions(+), 1 deletions(-)

diff --git a/src/memory.c b/src/memory.c
index 550015a..73db083 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -40,6 +40,9 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <string.h>
 
 #include "ibverbs.h"
 
@@ -54,6 +57,8 @@
 #define MADV_DOFORK	11
 #endif
 
+#define MEMINFO_SIZE	2048
+
 struct ibv_mem_node {
 	enum {
 		IBV_RED,
@@ -68,8 +73,51 @@ struct ibv_mem_node {
 static struct ibv_mem_node *mm_root;
 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int page_size;
+static int huge_page_size;
 static int too_late;
 
+/*
+ * Return the kernel's default huge page size in bytes, parsed from the
+ * "Hugepagesize:" line (in kB) of /proc/meminfo, or a negative value on
+ * failure (a missing, malformed or too large value also sets errno).
+ */
+static int get_huge_page_size(void)
+{
+	static const char label[] = "Hugepagesize:";
+	char buf[MEMINFO_SIZE];
+	char *val, *end;
+	ssize_t len;
+	long kb;
+	int fd;
+
+	fd = open("/proc/meminfo", O_RDONLY);
+	if (fd < 0)
+		return fd;
+
+	len = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (len < 0)
+		return -1;
+	buf[len] = '\0';
+
+	val = strstr(buf, label);
+	if (!val) {
+		errno = EINVAL;
+		return -1;
+	}
+	val += strlen(label);
+
+	errno = 0;
+	kb = strtol(val, &end, 10);
+	/* Reject unparsable values and ones that overflow int in bytes. */
+	if (errno != 0 || end == val || kb <= 0 || kb > INT32_MAX / 1024) {
+		errno = errno ? errno : EINVAL;
+		return -1;
+	}
+
+	return (int)(kb * 1024);
+}
+
 int ibv_fork_init(void)
 {
 	void *tmp;
@@ -85,6 +133,8 @@ int ibv_fork_init(void)
 	if (page_size < 0)
 		return errno;
 
+	huge_page_size = get_huge_page_size();
+
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
@@ -554,7 +604,8 @@ static struct ibv_mem_node *prepare_to_roll_back(struct ibv_mem_node *node,
 	return node;
 }
 
-static int ibv_madvise_range(void *base, size_t size, int advice)
+static int ibv_madvise_range_helper(void *base, size_t size, int advice,
+				    int page_size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -646,6 +697,22 @@ out:
 	return ret;
 }
 
+/*
+ * Apply advice to a range, first aligned to the default page size and,
+ * if that fails with EINVAL, aligned to the huge page size instead.
+ */
+static int ibv_madvise_range(void *base, size_t size, int advice)
+{
+	int rc = ibv_madvise_range_helper(base, size, advice, page_size);
+
+	/* Retry only on EINVAL, and only if a huge page size is known. */
+	if (rc != -1 || errno != EINVAL || huge_page_size <= 0)
+		return rc;
+
+	/* Huge page backed memory must be aligned to a huge page boundary. */
+	return ibv_madvise_range_helper(base, size, advice, huge_page_size);
+}
+
 int ibv_dontfork_range(void *base, size_t size)
 {
 	if (mm_root)



More information about the ewg mailing list