[ofw] [patch][mlx4] enlarge the bus driver internal limitation on the system memory size from 256 GB to 8 TB

Fab Tillier ftillier at microsoft.com
Tue Dec 14 00:38:10 PST 2010


It was fixed starting with Windows 7/Server 2008 R2.

It may make sense to trap the target OS version in the build and conditionally build the cache?  I would think the simpler we can make the code, the better.  Then we can mark the older OS code with descriptive checks to allow deleting it as we drop support for older operating systems.

-Fab

From: Leonid Keller [mailto:leonid at mellanox.co.il]
Sent: Monday, December 13, 2010 2:12 AM
To: Fab Tillier; 'ofw at lists.openfabrics.org'
Subject: RE: [ofw] [patch][mlx4] enlarge the bus driver internal limitation on the system memory size from 256 GB to 8 TB

Good news, I didn't know.
Do you know in which version it was fixed?
We still support win2k3 and winxp ...

From: Fab Tillier [mailto:ftillier at microsoft.com]
Sent: Monday, December 13, 2010 7:14 AM
To: Leonid Keller; 'ofw at lists.openfabrics.org'
Subject: RE: [ofw] [patch][mlx4] enlarge the bus driver internal limitation on the system memory size from 256 GB to 8 TB

Note that the whole cache can go away - the memory corruption when probing pages for write access has been fixed, and you can just call MmProbeAndLockPages.

-Fab

From: ofw-bounces at lists.openfabrics.org [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Leonid Keller
Sent: Sunday, December 12, 2010 6:02 AM
To: 'ofw at lists.openfabrics.org'
Subject: [ofw] [patch][mlx4] enlarge the bus driver internal limitation on the system memory size from 256 GB to 8 TB

The bus driver's memory registration mechanism uses an internal cache that imposes a limit on the system memory size.
Until now the limit was 256 GB, but one of our customers has machines with up to 1 TB of system memory...

Index: hw/mlx4/kernel/bus/core/pa_cash.c
===================================================================
--- hw/mlx4/kernel/bus/core/pa_cash.c               (revision 3023)
+++ hw/mlx4/kernel/bus/core/pa_cash.c            (working copy)
@@ -50,9 +50,10 @@
 ///////////////////////////////////////////////////////////////////////////

 #ifdef _WIN64
-#define MAX_PAGES_SUPPORTED        (64 * 1024 * 1024)                            // 256 GB
+// be careful with setting it >= 4G. Compiler puts it into an integer, so 4*1024*1024*1024 = 0 !!!
+#define MAX_PAGES_SUPPORTED       ((u32)2 * 1024 * 1024 * 1024)                                                                      // 8 TB
 #else
-#define MAX_PAGES_SUPPORTED        (16 * 1024 * 1024)                            // 64 GB
+#define MAX_PAGES_SUPPORTED       ((u32)16 * 1024 * 1024)                                                                                                                 // 64 GB
 #endif

 #define FREE_LIST_TRESHOLD                  256                         // max number of pages in free list
@@ -63,13 +64,9 @@
 //
 ///////////////////////////////////////////////////////////////////////////

-#define PA_TABLE_ENTRY_SIZE              sizeof(pa_table_entry_t)
-#define PA_TABLE_ENTRY_NUM           (PAGE_SIZE / PA_TABLE_ENTRY_SIZE)
-#define PA_TABLE_SIZE                                              (PA_TABLE_ENTRY_SIZE * PA_TABLE_ENTRY_NUM)
+#define PA_TABLE_ENTRY_SIZE                             sizeof(pa_table_entry_t)                                                                             // 4

-#define PA_DIR_ENTRY_SIZE                   sizeof(pa_dir_entry_t)
-#define PA_DIR_ENTRY_NUM                 (MAX_PAGES_SUPPORTED /PA_TABLE_ENTRY_NUM)
-#define PA_DIR_SIZE                                                   (PA_DIR_ENTRY_SIZE * PA_DIR_ENTRY_NUM)
+#define PA_DIR_ENTRY_SIZE                  sizeof(pa_dir_entry_t)                                                                                  // 16 for x64


 ///////////////////////////////////////////////////////////////////////////
@@ -107,6 +104,11 @@
 DEFINE_MUTEX(g_pa_mutex);
 u64 g_pa[1024];
 pa_cash_t g_cash;
+u32 g_max_pages_supported = 0;
+u32 g_pa_table_entry_num = 0;
+u32 g_pa_table_size = 0;
+u32 g_pa_dir_entry_num = 0;
+u32 g_pa_dir_size = 0;


 ///////////////////////////////////////////////////////////////////////////
@@ -133,7 +135,7 @@
                                g_cash.free_nr_pages--;
                }
                else  /* allocate new page */
-                              pa_te = (pa_table_entry_t *)kzalloc( PA_TABLE_SIZE, GFP_KERNEL );
+                             pa_te = (pa_table_entry_t *)kzalloc( g_pa_table_size, GFP_KERNEL );

                return pa_te;
 }
@@ -150,15 +152,15 @@

 static pa_table_entry_t * pa_get_page(uint32_t ix)
 {
-              pa_table_entry_t *pa_te =  g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].pa_te;
+             pa_table_entry_t *pa_te =  g_cash.pa_dir[ix / g_pa_table_entry_num].pa_te;

                /* no this page_table - add a new one */
                if (!pa_te) {
                                pa_te = pa_alloc_page();
                                if (!pa_te)
                                                return NULL;
-                              g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].pa_te = pa_te;
-                              g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].used = 0;
+                             g_cash.pa_dir[ix / g_pa_table_entry_num].pa_te = pa_te;
+                             g_cash.pa_dir[ix / g_pa_table_entry_num].used = 0;
                                g_cash.cur_nr_pages++;
                }

@@ -167,8 +169,8 @@

 static void pa_put_page(uint32_t ix)
 {
-              pa_free_page(g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].pa_te);
-              g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].pa_te = NULL;
+             pa_free_page(g_cash.pa_dir[ix / g_pa_table_entry_num].pa_te);
+             g_cash.pa_dir[ix / g_pa_table_entry_num].pa_te = NULL;
                g_cash.cur_nr_pages--;
 }

@@ -189,9 +191,9 @@
                                return -ENOMEM;

                /* register page address */
-              if (!pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt)
-                              ++g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].used;
-              ++pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt;
+             if (!pa_te[ix % g_pa_table_entry_num].ref_cnt)
+                             ++g_cash.pa_dir[ix / g_pa_table_entry_num].used;
+             ++pa_te[ix % g_pa_table_entry_num].ref_cnt;

                return 0;
 }
@@ -208,7 +210,7 @@
                                return -EFAULT;
                }

-              pa_te =  g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].pa_te;
+             pa_te =  g_cash.pa_dir[ix / g_pa_table_entry_num].pa_te;

                /* no this page_table - error*/
                if (!pa_te)  {
@@ -217,13 +219,13 @@
                }

                /* deregister page address */
-              --pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt;
-              ASSERT(pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt >= 0);
+             --pa_te[ix % g_pa_table_entry_num].ref_cnt;
+             ASSERT(pa_te[ix % g_pa_table_entry_num].ref_cnt >= 0);

                /* release the page on need */
-              if (!pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt)
-                              --g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].used;
-              if (!g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].used)
+             if (!pa_te[ix % g_pa_table_entry_num].ref_cnt)
+                             --g_cash.pa_dir[ix / g_pa_table_entry_num].used;
+             if (!g_cash.pa_dir[ix / g_pa_table_entry_num].used)
                                pa_put_page(ix);

                return 0;
@@ -301,7 +303,7 @@

 void pa_cash_release()
 {
-              int i;
+             u32 i;

                pa_cash_print();

@@ -309,7 +311,7 @@
                                return;

                /* free cash tables */
-              for (i=0; i<PA_DIR_ENTRY_NUM; ++i)
+             for (i=0; i<g_pa_dir_entry_num; ++i)
                                if (g_cash.pa_dir[i].pa_te) {
                                                kfree(g_cash.pa_dir[i].pa_te);
                                                g_cash.cur_nr_pages--;
@@ -338,24 +340,31 @@
                                return -EFAULT;
                }

-              pa_te =  g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM].pa_te;
+             pa_te =  g_cash.pa_dir[ix / g_pa_table_entry_num].pa_te;

                /* no this page_table */
                if (!pa_te)
                                return 0;

-              return pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt;
+             return pa_te[ix % g_pa_table_entry_num].ref_cnt;
 }

 int pa_cash_init()
 {
                void *pa_dir;
-              pa_dir = kzalloc(PA_DIR_SIZE, GFP_KERNEL);

+             g_max_pages_supported = MAX_PAGES_SUPPORTED;
+             g_pa_table_entry_num = PAGE_SIZE / PA_TABLE_ENTRY_SIZE;
+             g_pa_table_size = PA_TABLE_ENTRY_SIZE * g_pa_table_entry_num;
+             g_pa_dir_entry_num = g_max_pages_supported /g_pa_table_entry_num;
+             g_pa_dir_size = PA_DIR_ENTRY_SIZE * g_pa_dir_entry_num;
+
+             pa_dir = kzalloc(g_pa_dir_size, GFP_KERNEL);
+
                if (!pa_dir)
                                return -ENOMEM;
                g_cash.pa_dir = pa_dir;
-              g_cash.max_nr_pages = PA_TABLE_ENTRY_NUM * PA_DIR_ENTRY_NUM;
+             g_cash.max_nr_pages = g_pa_table_entry_num * g_pa_dir_entry_num;
                g_cash.free_list_hdr.Next = NULL;
                g_cash.cur_nr_pages = 0;
                g_cash.free_nr_pages = 0;
@@ -363,4 +372,4 @@
                mutex_init(&g_pa_mutex);
                return 0;
 }
-
+
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20101214/fcf2bdb8/attachment.html>


More information about the ofw mailing list