[ewg] [PATCH OFED-1.5] ehca: sync with mainline

Alexander Schmidt alexs at linux.vnet.ibm.com
Fri Jul 24 02:13:29 PDT 2009


Hi Vlad,

Please apply the following patch for OFED-1.5 to pull in the latest
upstream fixes for the ehca driver.
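
The patch adds the following upstream commits as kernel_patches/fixes:

  ehca-0010: IB/ehca: Replace vmalloc() with kmalloc() for queue allocation
  ehca-0020: IB/ehca: Fall back to vmalloc() for big allocations
  ehca-0030: IB/ehca: Remove unnecessary memory operations for userspace queue pairs
  ehca-0040: IB/ehca: Remove superfluous bitmasks from QP control block
  ehca-0050: IB/ehca: Tolerate dynamic memory operations before driver load
  ehca-0060: IB/ehca: Bump version number
  ehca-0070: IB/ehca: Make port autodetect mode the default

For reference, patches 0010 and 0020 together change the queue page list
allocation in ipz_pt_fn.c to try kmalloc() first and fall back to vmalloc()
only if that fails, freeing with the matching routine afterwards. A minimal
sketch of the resulting pattern (not part of the patch text below,
identifiers as in the driver):

	/* try the slab allocator first; fall back to vmalloc() when the
	 * request is too large or memory is too fragmented for kmalloc() */
	queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL);
	if (!queue->queue_pages) {
		queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
		if (!queue->queue_pages)
			return 0;
	}

	/* ... on teardown, pick the free routine that matches the allocator */
	if (is_vmalloc_addr(queue->queue_pages))
		vfree(queue->queue_pages);
	else
		kfree(queue->queue_pages);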

Thanks,
Alex

Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0010-replace_vmalloc.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0010-replace_vmalloc.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,43 @@
+commit bf31a1a02eb28d9bda0bb74345df7889faeb7335
+Author: Anton Blanchard <antonb at au1.ibm.com>
+Date:   Wed May 13 16:52:40 2009 -0700
+
+    IB/ehca: Replace vmalloc() with kmalloc() for queue allocation
+    
+    To improve performance of driver resource allocation, replace
+    vmalloc() calls with kmalloc().
+    
+    Signed-off-by: Stefan Roscher <stefan.roscher at de.ibm.com>
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+index c3a3284..a260559 100644
+--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c
++++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+@@ -220,7 +220,7 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ 	queue->small_page = NULL;
+ 
+ 	/* allocate queue page pointers */
+-	queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
++	queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL);
+ 	if (!queue->queue_pages) {
+ 		ehca_gen_err("Couldn't allocate queue page list");
+ 		return 0;
+@@ -240,7 +240,7 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ ipz_queue_ctor_exit0:
+ 	ehca_gen_err("Couldn't alloc pages queue=%p "
+ 		 "nr_of_pages=%x",  queue, nr_of_pages);
+-	vfree(queue->queue_pages);
++	kfree(queue->queue_pages);
+ 
+ 	return 0;
+ }
+@@ -262,7 +262,7 @@ int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+ 			free_page((unsigned long)queue->queue_pages[i]);
+ 	}
+ 
+-	vfree(queue->queue_pages);
++	kfree(queue->queue_pages);
+ 
+ 	return 1;
+ }
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0020-vmalloc_for_big_allocation.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0020-vmalloc_for_big_allocation.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,56 @@
+commit c94f156f63c835ffc02b686f9d4238b106f31a5d
+Author: Stefan Roscher <ossrosch at linux.vnet.ibm.com>
+Date:   Wed May 13 16:52:42 2009 -0700
+
+    IB/ehca: Fall back to vmalloc() for big allocations
+    
+    In case of large queue pairs there is the possibility of allocation
+    failures due to memory fragmentation when using kmalloc().  To ensure
+    the memory is allocated even if kmalloc() can not find chunks which
+    are big enough, we fall back to allocating the memory with vmalloc().
+    
+    Signed-off-by: Stefan Roscher <stefan.roscher at de.ibm.com>
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+index a260559..1227c59 100644
+--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c
++++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+@@ -222,8 +222,11 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ 	/* allocate queue page pointers */
+ 	queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL);
+ 	if (!queue->queue_pages) {
+-		ehca_gen_err("Couldn't allocate queue page list");
+-		return 0;
++		queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
++		if (!queue->queue_pages) {
++			ehca_gen_err("Couldn't allocate queue page list");
++			return 0;
++		}
+ 	}
+ 	memset(queue->queue_pages, 0, nr_of_pages * sizeof(void *));
+ 
+@@ -240,7 +243,10 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ ipz_queue_ctor_exit0:
+ 	ehca_gen_err("Couldn't alloc pages queue=%p "
+ 		 "nr_of_pages=%x",  queue, nr_of_pages);
+-	kfree(queue->queue_pages);
++	if (is_vmalloc_addr(queue->queue_pages))
++		vfree(queue->queue_pages);
++	else
++		kfree(queue->queue_pages);
+ 
+ 	return 0;
+ }
+@@ -262,7 +268,10 @@ int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+ 			free_page((unsigned long)queue->queue_pages[i]);
+ 	}
+ 
+-	kfree(queue->queue_pages);
++	if (is_vmalloc_addr(queue->queue_pages))
++		vfree(queue->queue_pages);
++	else
++		kfree(queue->queue_pages);
+ 
+ 	return 1;
+ }
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0030-remove_allocation_for_user_qp.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0030-remove_allocation_for_user_qp.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,318 @@
+commit 1988d1fa1a9d642c5714a6afc9775fba0627f3ed
+Author: Stefan Roscher <ossrosch at linux.vnet.ibm.com>
+Date:   Wed May 13 16:52:43 2009 -0700
+
+    IB/ehca: Remove unnecessary memory operations for userspace queue pairs
+    
+    The queue map for flush completion circumvention is only used for
+    kernel space queue pairs.  This patch skips the allocation of the
+    queue maps in case the QP is created for userspace.  In addition, this
+    patch does not iomap the galpas for kernel usage if the queue pair is
+    only used in userspace.  These changes will improve the performance of
+    creation of userspace queue pairs.
+    
+    Signed-off-by: Stefan Roscher <stefan.roscher at de.ibm.com>
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
+index 00c1081..ead4e71 100644
+--- a/drivers/infiniband/hw/ehca/ehca_qp.c
++++ b/drivers/infiniband/hw/ehca/ehca_qp.c
+@@ -461,7 +461,7 @@ static struct ehca_qp *internal_create_qp(
+ 					      ib_device);
+ 	struct ib_ucontext *context = NULL;
+ 	u64 h_ret;
+-	int is_llqp = 0, has_srq = 0;
++	int is_llqp = 0, has_srq = 0, is_user = 0;
+ 	int qp_type, max_send_sge, max_recv_sge, ret;
+ 
+ 	/* h_call's out parameters */
+@@ -609,9 +609,6 @@ static struct ehca_qp *internal_create_qp(
+ 		}
+ 	}
+ 
+-	if (pd->uobject && udata)
+-		context = pd->uobject->context;
+-
+ 	my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+ 	if (!my_qp) {
+ 		ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+@@ -619,6 +616,11 @@ static struct ehca_qp *internal_create_qp(
+ 		return ERR_PTR(-ENOMEM);
+ 	}
+ 
++	if (pd->uobject && udata) {
++		is_user = 1;
++		context = pd->uobject->context;
++	}
++
+ 	atomic_set(&my_qp->nr_events, 0);
+ 	init_waitqueue_head(&my_qp->wait_completion);
+ 	spin_lock_init(&my_qp->spinlock_s);
+@@ -707,7 +709,7 @@ static struct ehca_qp *internal_create_qp(
+ 			(parms.squeue.is_small || parms.rqueue.is_small);
+ 	}
+ 
+-	h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms);
++	h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+ 	if (h_ret != H_SUCCESS) {
+ 		ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+ 			 h_ret);
+@@ -769,18 +771,20 @@ static struct ehca_qp *internal_create_qp(
+ 			goto create_qp_exit2;
+ 		}
+ 
+-		my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+-			 my_qp->ipz_squeue.qe_size;
+-		my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+-					sizeof(struct ehca_qmap_entry));
+-		if (!my_qp->sq_map.map) {
+-			ehca_err(pd->device, "Couldn't allocate squeue "
+-				 "map ret=%i", ret);
+-			goto create_qp_exit3;
++		if (!is_user) {
++			my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
++				my_qp->ipz_squeue.qe_size;
++			my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
++						    sizeof(struct ehca_qmap_entry));
++			if (!my_qp->sq_map.map) {
++				ehca_err(pd->device, "Couldn't allocate squeue "
++					 "map ret=%i", ret);
++				goto create_qp_exit3;
++			}
++			INIT_LIST_HEAD(&my_qp->sq_err_node);
++			/* to avoid the generation of bogus flush CQEs */
++			reset_queue_map(&my_qp->sq_map);
+ 		}
+-		INIT_LIST_HEAD(&my_qp->sq_err_node);
+-		/* to avoid the generation of bogus flush CQEs */
+-		reset_queue_map(&my_qp->sq_map);
+ 	}
+ 
+ 	if (HAS_RQ(my_qp)) {
+@@ -792,20 +796,21 @@ static struct ehca_qp *internal_create_qp(
+ 				 "and pages ret=%i", ret);
+ 			goto create_qp_exit4;
+ 		}
+-
+-		my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+-			my_qp->ipz_rqueue.qe_size;
+-		my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+-				sizeof(struct ehca_qmap_entry));
+-		if (!my_qp->rq_map.map) {
+-			ehca_err(pd->device, "Couldn't allocate squeue "
+-					"map ret=%i", ret);
+-			goto create_qp_exit5;
++		if (!is_user) {
++			my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
++				my_qp->ipz_rqueue.qe_size;
++			my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
++						    sizeof(struct ehca_qmap_entry));
++			if (!my_qp->rq_map.map) {
++				ehca_err(pd->device, "Couldn't allocate squeue "
++					 "map ret=%i", ret);
++				goto create_qp_exit5;
++			}
++			INIT_LIST_HEAD(&my_qp->rq_err_node);
++			/* to avoid the generation of bogus flush CQEs */
++			reset_queue_map(&my_qp->rq_map);
+ 		}
+-		INIT_LIST_HEAD(&my_qp->rq_err_node);
+-		/* to avoid the generation of bogus flush CQEs */
+-		reset_queue_map(&my_qp->rq_map);
+-	} else if (init_attr->srq) {
++	} else if (init_attr->srq && !is_user) {
+ 		/* this is a base QP, use the queue map of the SRQ */
+ 		my_qp->rq_map = my_srq->rq_map;
+ 		INIT_LIST_HEAD(&my_qp->rq_err_node);
+@@ -918,7 +923,7 @@ create_qp_exit7:
+ 	kfree(my_qp->mod_qp_parm);
+ 
+ create_qp_exit6:
+-	if (HAS_RQ(my_qp))
++	if (HAS_RQ(my_qp) && !is_user)
+ 		vfree(my_qp->rq_map.map);
+ 
+ create_qp_exit5:
+@@ -926,7 +931,7 @@ create_qp_exit5:
+ 		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+ 
+ create_qp_exit4:
+-	if (HAS_SQ(my_qp))
++	if (HAS_SQ(my_qp) && !is_user)
+ 		vfree(my_qp->sq_map.map);
+ 
+ create_qp_exit3:
+@@ -1244,6 +1249,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ 	u64 update_mask;
+ 	u64 h_ret;
+ 	int bad_wqe_cnt = 0;
++	int is_user = 0;
+ 	int squeue_locked = 0;
+ 	unsigned long flags = 0;
+ 
+@@ -1266,6 +1272,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ 		ret = ehca2ib_return_code(h_ret);
+ 		goto modify_qp_exit1;
+ 	}
++	if (ibqp->uobject)
++		is_user = 1;
+ 
+ 	qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+ 
+@@ -1728,7 +1736,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ 			goto modify_qp_exit2;
+ 		}
+ 	}
+-	if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)) {
++	if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
++	    && !is_user) {
+ 		ret = check_for_left_cqes(my_qp, shca);
+ 		if (ret)
+ 			goto modify_qp_exit2;
+@@ -1738,16 +1747,17 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ 		ipz_qeit_reset(&my_qp->ipz_rqueue);
+ 		ipz_qeit_reset(&my_qp->ipz_squeue);
+ 
+-		if (qp_cur_state == IB_QPS_ERR) {
++		if (qp_cur_state == IB_QPS_ERR && !is_user) {
+ 			del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+ 
+ 			if (HAS_RQ(my_qp))
+ 				del_from_err_list(my_qp->recv_cq,
+ 						  &my_qp->rq_err_node);
+ 		}
+-		reset_queue_map(&my_qp->sq_map);
++		if (!is_user)
++			reset_queue_map(&my_qp->sq_map);
+ 
+-		if (HAS_RQ(my_qp))
++		if (HAS_RQ(my_qp) && !is_user)
+ 			reset_queue_map(&my_qp->rq_map);
+ 	}
+ 
+@@ -2138,10 +2148,12 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ 	int ret;
+ 	u64 h_ret;
+ 	u8 port_num;
++	int is_user = 0;
+ 	enum ib_qp_type	qp_type;
+ 	unsigned long flags;
+ 
+ 	if (uobject) {
++		is_user = 1;
+ 		if (my_qp->mm_count_galpa ||
+ 		    my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+ 			ehca_err(dev, "Resources still referenced in "
+@@ -2168,10 +2180,10 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ 	 * SRQs will never get into an error list and do not have a recv_cq,
+ 	 * so we need to skip them here.
+ 	 */
+-	if (HAS_RQ(my_qp) && !IS_SRQ(my_qp))
++	if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+ 		del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+ 
+-	if (HAS_SQ(my_qp))
++	if (HAS_SQ(my_qp) && !is_user)
+ 		del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+ 
+ 	/* now wait until all pending events have completed */
+@@ -2209,13 +2221,13 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ 
+ 	if (HAS_RQ(my_qp)) {
+ 		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+-
+-		vfree(my_qp->rq_map.map);
++		if (!is_user)
++			vfree(my_qp->rq_map.map);
+ 	}
+ 	if (HAS_SQ(my_qp)) {
+ 		ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+-
+-		vfree(my_qp->sq_map.map);
++		if (!is_user)
++			vfree(my_qp->sq_map.map);
+ 	}
+ 	kmem_cache_free(qp_cache, my_qp);
+ 	atomic_dec(&shca->num_qps);
+diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
+index d0ab0c0..4d5dc33 100644
+--- a/drivers/infiniband/hw/ehca/hcp_if.c
++++ b/drivers/infiniband/hw/ehca/hcp_if.c
+@@ -284,7 +284,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ 	param->act_pages = (u32)outs[4];
+ 
+ 	if (ret == H_SUCCESS)
+-		hcp_galpas_ctor(&cq->galpas, outs[5], outs[6]);
++		hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+ 
+ 	if (ret == H_NOT_ENOUGH_RESOURCES)
+ 		ehca_gen_err("Not enough resources. ret=%lli", ret);
+@@ -293,7 +293,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ }
+ 
+ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+-			     struct ehca_alloc_qp_parms *parms)
++			     struct ehca_alloc_qp_parms *parms, int is_user)
+ {
+ 	u64 ret;
+ 	u64 allocate_controls, max_r10_reg, r11, r12;
+@@ -359,7 +359,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+ 		(u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+ 
+ 	if (ret == H_SUCCESS)
+-		hcp_galpas_ctor(&parms->galpas, outs[6], outs[6]);
++		hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+ 
+ 	if (ret == H_NOT_ENOUGH_RESOURCES)
+ 		ehca_gen_err("Not enough resources. ret=%lli", ret);
+diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
+index 2c3c6e0..39c1c36 100644
+--- a/drivers/infiniband/hw/ehca/hcp_if.h
++++ b/drivers/infiniband/hw/ehca/hcp_if.h
+@@ -78,7 +78,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+  * initialize resources, create empty QPPTs (2 rings).
+  */
+ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+-			     struct ehca_alloc_qp_parms *parms);
++			     struct ehca_alloc_qp_parms *parms, int is_user);
+ 
+ u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+ 		      const u8 port_id,
+diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c
+index 2148210..b3e0e72 100644
+--- a/drivers/infiniband/hw/ehca/hcp_phyp.c
++++ b/drivers/infiniband/hw/ehca/hcp_phyp.c
+@@ -54,12 +54,15 @@ int hcall_unmap_page(u64 mapaddr)
+ 	return 0;
+ }
+ 
+-int hcp_galpas_ctor(struct h_galpas *galpas,
++int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ 		    u64 paddr_kernel, u64 paddr_user)
+ {
+-	int ret = hcall_map_page(paddr_kernel, &galpas->kernel.fw_handle);
+-	if (ret)
+-		return ret;
++	if (!is_user) {
++		int ret = hcall_map_page(paddr_kernel, &galpas->kernel.fw_handle);
++		if (ret)
++			return ret;
++	} else
++		galpas->kernel.fw_handle = 0;
+ 
+ 	galpas->user.fw_handle = paddr_user;
+ 
+diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h
+index 5305c2a..204227d 100644
+--- a/drivers/infiniband/hw/ehca/hcp_phyp.h
++++ b/drivers/infiniband/hw/ehca/hcp_phyp.h
+@@ -78,7 +78,7 @@ static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+ 	*(volatile u64 __force *)addr = value;
+ }
+ 
+-int hcp_galpas_ctor(struct h_galpas *galpas,
++int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ 		    u64 paddr_kernel, u64 paddr_user);
+ 
+ int hcp_galpas_dtor(struct h_galpas *galpas);
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0040-remove_bitmask_macros.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0040-remove_bitmask_macros.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,132 @@
+commit 25a52393270ca48c7d0848672ad4423313033c3d
+Author: Joachim Fenkes <fenkes at de.ibm.com>
+Date:   Wed Jun 3 13:25:42 2009 -0700
+
+    IB/ehca: Remove superfluous bitmasks from QP control block
+    
+    All the fields in the control block are nicely right-aligned, so no
+    masking is necessary.
+    
+    Signed-off-by: Joachim Fenkes <fenkes at de.ibm.com>
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
+index 1798e64..689c357 100644
+--- a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
++++ b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
+@@ -165,7 +165,6 @@ struct hcp_modify_qp_control_block {
+ #define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
+ #define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
+ #define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
+-#define MQPCB_QP_STATE                          EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
+ #define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
+ #define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
+@@ -176,60 +175,33 @@ struct hcp_modify_qp_control_block {
+ #define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
+ #define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
+ #define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
+-#define MQPCB_PATH_MTU                          EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
+-#define MQPCB_MAX_STATIC_RATE                   EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
+-#define MQPCB_DLID                              EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
+-#define MQPCB_RNR_RETRY_COUNT                   EHCA_BMASK_IBM(29, 31)
+ #define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
+-#define MQPCB_SOURCE_PATH_BITS                  EHCA_BMASK_IBM(25, 31)
+ #define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
+-#define MQPCB_TRAFFIC_CLASS                     EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
+-#define MQPCB_HOP_LIMIT                         EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
+-#define MQPCB_SOURCE_GID_IDX                    EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
+-#define MQPCB_FLOW_LABEL                        EHCA_BMASK_IBM(12, 31)
+ #define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
+ #define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
+-#define MQPCB_SERVICE_LEVEL_AL                  EHCA_BMASK_IBM(28, 31)
+ #define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
+-#define MQPCB_SEND_GRH_FLAG_AL                  EHCA_BMASK_IBM(31, 31)
+ #define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
+-#define MQPCB_RETRY_COUNT_AL                    EHCA_BMASK_IBM(29, 31)
+ #define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
+-#define MQPCB_TIMEOUT_AL                        EHCA_BMASK_IBM(27, 31)
+ #define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
+-#define MQPCB_MAX_STATIC_RATE_AL                EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
+-#define MQPCB_DLID_AL                           EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
+-#define MQPCB_RNR_RETRY_COUNT_AL                EHCA_BMASK_IBM(29, 31)
+ #define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
+-#define MQPCB_SOURCE_PATH_BITS_AL               EHCA_BMASK_IBM(25, 31)
+ #define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
+-#define MQPCB_TRAFFIC_CLASS_AL                  EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
+-#define MQPCB_HOP_LIMIT_AL                      EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
+-#define MQPCB_SOURCE_GID_IDX_AL                 EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
+-#define MQPCB_FLOW_LABEL_AL                     EHCA_BMASK_IBM(12, 31)
+ #define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
+ #define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
+-#define MQPCB_MAX_NR_OUTST_SEND_WR              EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
+-#define MQPCB_MAX_NR_OUTST_RECV_WR              EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
+-#define MQPCB_DISABLE_ETE_CREDIT_CHECK          EHCA_BMASK_IBM(31, 31)
+-#define MQPCB_QP_NUMBER                         EHCA_BMASK_IBM( 8, 31)
+ #define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
+-#define MQPCB_QP_ENABLE                         EHCA_BMASK_IBM(31, 31)
+ #define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
+-#define MQPCB_CURR_SRQ_LIMIT                    EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
+ #define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
+ 
+diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
+index ead4e71..0338f1f 100644
+--- a/drivers/infiniband/hw/ehca/ehca_qp.c
++++ b/drivers/infiniband/hw/ehca/ehca_qp.c
+@@ -1962,19 +1962,13 @@ int ehca_query_qp(struct ib_qp *qp,
+ 	qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+ 	qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+ 
+-	qp_attr->pkey_index =
+-		EHCA_BMASK_GET(MQPCB_PRIM_P_KEY_IDX, qpcb->prim_p_key_idx);
+-
+-	qp_attr->port_num =
+-		EHCA_BMASK_GET(MQPCB_PRIM_PHYS_PORT, qpcb->prim_phys_port);
+-
++	qp_attr->pkey_index = qpcb->prim_p_key_idx;
++	qp_attr->port_num = qpcb->prim_phys_port;
+ 	qp_attr->timeout = qpcb->timeout;
+ 	qp_attr->retry_cnt = qpcb->retry_count;
+ 	qp_attr->rnr_retry = qpcb->rnr_retry_count;
+ 
+-	qp_attr->alt_pkey_index =
+-		EHCA_BMASK_GET(MQPCB_PRIM_P_KEY_IDX, qpcb->alt_p_key_idx);
+-
++	qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+ 	qp_attr->alt_port_num = qpcb->alt_phys_port;
+ 	qp_attr->alt_timeout = qpcb->timeout_al;
+ 
+@@ -2061,8 +2055,7 @@ int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ 		update_mask |=
+ 			EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+ 			| EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+-		mqpcb->curr_srq_limit =
+-			EHCA_BMASK_SET(MQPCB_CURR_SRQ_LIMIT, attr->srq_limit);
++		mqpcb->curr_srq_limit = attr->srq_limit;
+ 		mqpcb->qp_aff_asyn_ev_log_reg =
+ 			EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+ 	}
+@@ -2125,8 +2118,7 @@ int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+ 
+ 	srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+ 	srq_attr->max_sge = 3;
+-	srq_attr->srq_limit = EHCA_BMASK_GET(
+-		MQPCB_CURR_SRQ_LIMIT, qpcb->curr_srq_limit);
++	srq_attr->srq_limit = qpcb->curr_srq_limit;
+ 
+ 	if (ehca_debug_level >= 2)
+ 		ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0050-dmem_toleration.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0050-dmem_toleration.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,745 @@
+commit 0cf89dcdbc53f2b43e4ce7419b6ff47f4309c2eb
+Author: Hannes Hering <hering2 at de.ibm.com>
+Date:   Mon Jun 22 22:18:51 2009 -0700
+
+    IB/ehca: Tolerate dynamic memory operations before driver load
+    
+    Implement toleration of dynamic memory operations and 16 GB gigantic
+    pages, where "toleration" means that the driver can cope with dynamic
+    memory operations that happen before the driver is loaded.  While the
+    ehca driver is loaded, dynamic memory operations are still prohibited
+    by returning NOTIFY_BAD from the memory notifier.
+    
+    On module load the driver walks through available system memory,
+    checks for available memory ranges and then registers the kernel
+    internal memory region accordingly.  The translation of address ranges
+    is implemented via a 3-level busmap.
+    
+    Signed-off-by: Hannes Hering <hering2 at de.ibm.com>
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
+index ce4e6ef..14a18b7 100644
+--- a/drivers/infiniband/hw/ehca/ehca_main.c
++++ b/drivers/infiniband/hw/ehca/ehca_main.c
+@@ -506,6 +506,7 @@ static int ehca_init_device(struct ehca_shca *shca)
+ 	shca->ib_device.detach_mcast	    = ehca_detach_mcast;
+ 	shca->ib_device.process_mad	    = ehca_process_mad;
+ 	shca->ib_device.mmap		    = ehca_mmap;
++	shca->ib_device.dma_ops		    = &ehca_dma_mapping_ops;
+ 
+ 	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+ 		shca->ib_device.uverbs_cmd_mask |=
+@@ -1028,17 +1029,23 @@ static int __init ehca_module_init(void)
+ 		goto module_init1;
+ 	}
+ 
++	ret = ehca_create_busmap();
++	if (ret) {
++		ehca_gen_err("Cannot create busmap.");
++		goto module_init2;
++	}
++
+ 	ret = ibmebus_register_driver(&ehca_driver);
+ 	if (ret) {
+ 		ehca_gen_err("Cannot register eHCA device driver");
+ 		ret = -EINVAL;
+-		goto module_init2;
++		goto module_init3;
+ 	}
+ 
+ 	ret = register_memory_notifier(&ehca_mem_nb);
+ 	if (ret) {
+ 		ehca_gen_err("Failed registering memory add/remove notifier");
+-		goto module_init3;
++		goto module_init4;
+ 	}
+ 
+ 	if (ehca_poll_all_eqs != 1) {
+@@ -1053,9 +1060,12 @@ static int __init ehca_module_init(void)
+ 
+ 	return 0;
+ 
+-module_init3:
++module_init4:
+ 	ibmebus_unregister_driver(&ehca_driver);
+ 
++module_init3:
++	ehca_destroy_busmap();
++
+ module_init2:
+ 	ehca_destroy_slab_caches();
+ 
+@@ -1073,6 +1083,8 @@ static void __exit ehca_module_exit(void)
+ 
+ 	unregister_memory_notifier(&ehca_mem_nb);
+ 
++	ehca_destroy_busmap();
++
+ 	ehca_destroy_slab_caches();
+ 
+ 	ehca_destroy_comp_pool();
+diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
+index 72f83f7..7663a2a 100644
+--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
++++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
+@@ -53,6 +53,38 @@
+ /* max number of rpages (per hcall register_rpages) */
+ #define MAX_RPAGES 512
+ 
++/* DMEM toleration management */
++#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
++#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
++#define EHCA_HUGEPAGESHIFT     34
++#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
++#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
++#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
++#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
++#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
++#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
++#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
++#define EHCA_DIR_MAP_SIZE (0x10000)
++#define EHCA_ENT_MAP_SIZE (0x10000)
++#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
++
++static unsigned long ehca_mr_len;
++
++/*
++ * Memory map data structures
++ */
++struct ehca_dir_bmap {
++	u64 ent[EHCA_MAP_ENTRIES];
++};
++struct ehca_top_bmap {
++	struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
++};
++struct ehca_bmap {
++	struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
++};
++
++static struct ehca_bmap *ehca_bmap;
++
+ static struct kmem_cache *mr_cache;
+ static struct kmem_cache *mw_cache;
+ 
+@@ -68,6 +100,8 @@ enum ehca_mr_pgsize {
+ #define EHCA_MR_PGSHIFT1M  20
+ #define EHCA_MR_PGSHIFT16M 24
+ 
++static u64 ehca_map_vaddr(void *caddr);
++
+ static u32 ehca_encode_hwpage_size(u32 pgsize)
+ {
+ 	int log = ilog2(pgsize);
+@@ -135,7 +169,8 @@ struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+ 			goto get_dma_mr_exit0;
+ 		}
+ 
+-		ret = ehca_reg_maxmr(shca, e_maxmr, (u64 *)KERNELBASE,
++		ret = ehca_reg_maxmr(shca, e_maxmr,
++				     (void *)ehca_map_vaddr((void *)KERNELBASE),
+ 				     mr_access_flags, e_pd,
+ 				     &e_maxmr->ib.ib_mr.lkey,
+ 				     &e_maxmr->ib.ib_mr.rkey);
+@@ -251,7 +286,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+ 
+ 		ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+ 				  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+-				  &e_mr->ib.ib_mr.rkey);
++				  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ 		if (ret) {
+ 			ib_mr = ERR_PTR(ret);
+ 			goto reg_phys_mr_exit1;
+@@ -370,7 +405,7 @@ reg_user_mr_fallback:
+ 
+ 	ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+ 			  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+-			  &e_mr->ib.ib_mr.rkey);
++			  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ 	if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+ 		ehca_warn(pd->device, "failed to register mr "
+ 			  "with hwpage_size=%llx", hwpage_size);
+@@ -794,7 +829,7 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+ 	ret = ehca_reg_mr(shca, e_fmr, NULL,
+ 			  fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+ 			  mr_access_flags, e_pd, &pginfo,
+-			  &tmp_lkey, &tmp_rkey);
++			  &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+ 	if (ret) {
+ 		ib_fmr = ERR_PTR(ret);
+ 		goto alloc_fmr_exit1;
+@@ -983,6 +1018,10 @@ free_fmr_exit0:
+ 
+ /*----------------------------------------------------------------------*/
+ 
++static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
++				   struct ehca_mr *e_mr,
++				   struct ehca_mr_pginfo *pginfo);
++
+ int ehca_reg_mr(struct ehca_shca *shca,
+ 		struct ehca_mr *e_mr,
+ 		u64 *iova_start,
+@@ -991,7 +1030,8 @@ int ehca_reg_mr(struct ehca_shca *shca,
+ 		struct ehca_pd *e_pd,
+ 		struct ehca_mr_pginfo *pginfo,
+ 		u32 *lkey, /*OUT*/
+-		u32 *rkey) /*OUT*/
++		u32 *rkey, /*OUT*/
++		enum ehca_reg_type reg_type)
+ {
+ 	int ret;
+ 	u64 h_ret;
+@@ -1015,7 +1055,13 @@ int ehca_reg_mr(struct ehca_shca *shca,
+ 
+ 	e_mr->ipz_mr_handle = hipzout.handle;
+ 
+-	ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
++	if (reg_type == EHCA_REG_BUSMAP_MR)
++		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
++	else if (reg_type == EHCA_REG_MR)
++		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
++	else
++		ret = -EINVAL;
++
+ 	if (ret)
+ 		goto ehca_reg_mr_exit1;
+ 
+@@ -1316,7 +1362,7 @@ int ehca_rereg_mr(struct ehca_shca *shca,
+ 		e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+ 
+ 		ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+-				  e_pd, pginfo, lkey, rkey);
++				  e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+ 		if (ret) {
+ 			u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+ 			memcpy(&e_mr->flags, &(save_mr.flags),
+@@ -1409,7 +1455,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca,
+ 	ret = ehca_reg_mr(shca, e_fmr, NULL,
+ 			  (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+ 			  e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+-			  &tmp_rkey);
++			  &tmp_rkey, EHCA_REG_MR);
+ 	if (ret) {
+ 		u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+ 		memcpy(&e_fmr->flags, &(save_mr.flags),
+@@ -1478,6 +1524,90 @@ ehca_reg_smr_exit0:
+ } /* end ehca_reg_smr() */
+ 
+ /*----------------------------------------------------------------------*/
++static inline void *ehca_calc_sectbase(int top, int dir, int idx)
++{
++	unsigned long ret = idx;
++	ret |= dir << EHCA_DIR_INDEX_SHIFT;
++	ret |= top << EHCA_TOP_INDEX_SHIFT;
++	return abs_to_virt(ret << SECTION_SIZE_BITS);
++}
++
++#define ehca_bmap_valid(entry) \
++	((u64)entry != (u64)EHCA_INVAL_ADDR)
++
++static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
++			       struct ehca_shca *shca, struct ehca_mr *mr,
++			       struct ehca_mr_pginfo *pginfo)
++{
++	u64 h_ret = 0;
++	unsigned long page = 0;
++	u64 rpage = virt_to_abs(kpage);
++	int page_count;
++
++	void *sectbase = ehca_calc_sectbase(top, dir, idx);
++	if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
++		ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
++					   "hwpage_size does not fit to "
++					   "section start address");
++	}
++	page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
++
++	while (page < page_count) {
++		u64 rnum;
++		for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
++		     rnum++) {
++			void *pg = sectbase + ((page++) * pginfo->hwpage_size);
++			kpage[rnum] = virt_to_abs(pg);
++		}
++
++		h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
++			ehca_encode_hwpage_size(pginfo->hwpage_size),
++			0, rpage, rnum);
++
++		if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
++			ehca_err(&shca->ib_device, "register_rpage_mr failed");
++			return h_ret;
++		}
++	}
++	return h_ret;
++}
++
++static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
++				struct ehca_shca *shca, struct ehca_mr *mr,
++				struct ehca_mr_pginfo *pginfo)
++{
++	u64 hret = H_SUCCESS;
++	int idx;
++
++	for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
++		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
++			continue;
++
++		hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
++					   pginfo);
++		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
++				return hret;
++	}
++	return hret;
++}
++
++static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
++				    struct ehca_mr *mr,
++				    struct ehca_mr_pginfo *pginfo)
++{
++	u64 hret = H_SUCCESS;
++	int dir;
++
++	for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
++		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
++			continue;
++
++		hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
++		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
++				return hret;
++	}
++	return hret;
++}
+ 
+ /* register internal max-MR to internal SHCA */
+ int ehca_reg_internal_maxmr(
+@@ -1495,6 +1625,11 @@ int ehca_reg_internal_maxmr(
+ 	u32 num_hwpages;
+ 	u64 hw_pgsize;
+ 
++	if (!ehca_bmap) {
++		ret = -EFAULT;
++		goto ehca_reg_internal_maxmr_exit0;
++	}
++
+ 	e_mr = ehca_mr_new();
+ 	if (!e_mr) {
+ 		ehca_err(&shca->ib_device, "out of memory");
+@@ -1504,8 +1639,8 @@ int ehca_reg_internal_maxmr(
+ 	e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+ 
+ 	/* register internal max-MR on HCA */
+-	size_maxmr = (u64)high_memory - PAGE_OFFSET;
+-	iova_start = (u64 *)KERNELBASE;
++	size_maxmr = ehca_mr_len;
++	iova_start = (u64 *)ehca_map_vaddr((void *)KERNELBASE);
+ 	ib_pbuf.addr = 0;
+ 	ib_pbuf.size = size_maxmr;
+ 	num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+@@ -1524,7 +1659,7 @@ int ehca_reg_internal_maxmr(
+ 
+ 	ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+ 			  &pginfo, &e_mr->ib.ib_mr.lkey,
+-			  &e_mr->ib.ib_mr.rkey);
++			  &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+ 	if (ret) {
+ 		ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+ 			 "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+@@ -2077,8 +2212,8 @@ int ehca_mr_is_maxmr(u64 size,
+ 		     u64 *iova_start)
+ {
+ 	/* a MR is treated as max-MR only if it fits following: */
+-	if ((size == ((u64)high_memory - PAGE_OFFSET)) &&
+-	    (iova_start == (void *)KERNELBASE)) {
++	if ((size == ehca_mr_len) &&
++	    (iova_start == (void *)ehca_map_vaddr((void *)KERNELBASE))) {
+ 		ehca_gen_dbg("this is a max-MR");
+ 		return 1;
+ 	} else
+@@ -2184,3 +2319,350 @@ void ehca_cleanup_mrmw_cache(void)
+ 	if (mw_cache)
+ 		kmem_cache_destroy(mw_cache);
+ }
++
++static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
++				     int dir)
++{
++	if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
++		ehca_top_bmap->dir[dir] =
++			kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
++		if (!ehca_top_bmap->dir[dir])
++			return -ENOMEM;
++		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
++		memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
++	}
++	return 0;
++}
++
++static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
++{
++	if (!ehca_bmap_valid(ehca_bmap->top[top])) {
++		ehca_bmap->top[top] =
++			kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
++		if (!ehca_bmap->top[top])
++			return -ENOMEM;
++		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
++		memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
++	}
++	return ehca_init_top_bmap(ehca_bmap->top[top], dir);
++}
++
++static inline int ehca_calc_index(unsigned long i, unsigned long s)
++{
++	return (i >> s) & EHCA_INDEX_MASK;
++}
++
++void ehca_destroy_busmap(void)
++{
++	int top, dir;
++
++	if (!ehca_bmap)
++		return;
++
++	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
++		if (!ehca_bmap_valid(ehca_bmap->top[top]))
++			continue;
++		for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
++			if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
++				continue;
++
++			kfree(ehca_bmap->top[top]->dir[dir]);
++		}
++
++		kfree(ehca_bmap->top[top]);
++	}
++
++	kfree(ehca_bmap);
++	ehca_bmap = NULL;
++}
++
++static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
++{
++	unsigned long i, start_section, end_section;
++	int top, dir, idx;
++
++	if (!nr_pages)
++		return 0;
++
++	if (!ehca_bmap) {
++		ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
++		if (!ehca_bmap)
++			return -ENOMEM;
++		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
++		memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
++	}
++
++	start_section = phys_to_abs(pfn * PAGE_SIZE) / EHCA_SECTSIZE;
++	end_section = phys_to_abs((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
++	for (i = start_section; i < end_section; i++) {
++		int ret;
++		top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
++		dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
++		idx = i & EHCA_INDEX_MASK;
++
++		ret = ehca_init_bmap(ehca_bmap, top, dir);
++		if (ret) {
++			ehca_destroy_busmap();
++			return ret;
++		}
++		ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
++		ehca_mr_len += EHCA_SECTSIZE;
++	}
++	return 0;
++}
++
++static int ehca_is_hugepage(unsigned long pfn)
++{
++	int page_order;
++
++	if (pfn & EHCA_HUGEPAGE_PFN_MASK)
++		return 0;
++
++	page_order = compound_order(pfn_to_page(pfn));
++	if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
++		return 0;
++
++	return 1;
++}
++
++static int ehca_create_busmap_callback(unsigned long initial_pfn,
++				       unsigned long total_nr_pages, void *arg)
++{
++	int ret;
++	unsigned long pfn, start_pfn, end_pfn, nr_pages;
++
++	if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
++		return ehca_update_busmap(initial_pfn, total_nr_pages);
++
++	/* Given chunk is >= 16GB -> check for hugepages */
++	start_pfn = initial_pfn;
++	end_pfn = initial_pfn + total_nr_pages;
++	pfn = start_pfn;
++
++	while (pfn < end_pfn) {
++		if (ehca_is_hugepage(pfn)) {
++			/* Add mem found in front of the hugepage */
++			nr_pages = pfn - start_pfn;
++			ret = ehca_update_busmap(start_pfn, nr_pages);
++			if (ret)
++				return ret;
++			/* Skip the hugepage */
++			pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
++			start_pfn = pfn;
++		} else
++			pfn += (EHCA_SECTSIZE / PAGE_SIZE);
++	}
++
++	/* Add mem found behind the hugepage(s)  */
++	nr_pages = pfn - start_pfn;
++	return ehca_update_busmap(start_pfn, nr_pages);
++}
++
++int ehca_create_busmap(void)
++{
++	int ret;
++
++	ehca_mr_len = 0;
++	ret = walk_memory_resource(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
++				   ehca_create_busmap_callback);
++	return ret;
++}
++
++static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
++				   struct ehca_mr *e_mr,
++				   struct ehca_mr_pginfo *pginfo)
++{
++	int top;
++	u64 hret, *kpage;
++
++	kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
++	if (!kpage) {
++		ehca_err(&shca->ib_device, "kpage alloc failed");
++		return -ENOMEM;
++	}
++	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
++		if (!ehca_bmap_valid(ehca_bmap->top[top]))
++			continue;
++		hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
++		if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
++			break;
++	}
++
++	ehca_free_fw_ctrlblock(kpage);
++
++	if (hret == H_SUCCESS)
++		return 0; /* Everything is fine */
++	else {
++		ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
++				 "h_ret=%lli e_mr=%p top=%x lkey=%x "
++				 "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
++				 e_mr->ib.ib_mr.lkey,
++				 shca->ipz_hca_handle.handle,
++				 e_mr->ipz_mr_handle.handle);
++		return ehca2ib_return_code(hret);
++	}
++}
++
++static u64 ehca_map_vaddr(void *caddr)
++{
++	int top, dir, idx;
++	unsigned long abs_addr, offset;
++	u64 entry;
++
++	if (!ehca_bmap)
++		return EHCA_INVAL_ADDR;
++
++	abs_addr = virt_to_abs(caddr);
++	top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
++	if (!ehca_bmap_valid(ehca_bmap->top[top]))
++		return EHCA_INVAL_ADDR;
++
++	dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
++	if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
++		return EHCA_INVAL_ADDR;
++
++	idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
++
++	entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
++	if (ehca_bmap_valid(entry)) {
++		offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
++		return entry | offset;
++	} else
++		return EHCA_INVAL_ADDR;
++}
++
++static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
++{
++	return dma_addr == EHCA_INVAL_ADDR;
++}
++
++static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
++			       size_t size, enum dma_data_direction direction)
++{
++	if (cpu_addr)
++		return ehca_map_vaddr(cpu_addr);
++	else
++		return EHCA_INVAL_ADDR;
++}
++
++static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
++				  enum dma_data_direction direction)
++{
++	/* This is only a stub; nothing to be done here */
++}
++
++static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
++			     unsigned long offset, size_t size,
++			     enum dma_data_direction direction)
++{
++	u64 addr;
++
++	if (offset + size > PAGE_SIZE)
++		return EHCA_INVAL_ADDR;
++
++	addr = ehca_map_vaddr(page_address(page));
++	if (!ehca_dma_mapping_error(dev, addr))
++		addr += offset;
++
++	return addr;
++}
++
++static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
++				enum dma_data_direction direction)
++{
++	/* This is only a stub; nothing to be done here */
++}
++
++static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
++			   int nents, enum dma_data_direction direction)
++{
++	struct scatterlist *sg;
++	int i;
++
++	for_each_sg(sgl, sg, nents, i) {
++		u64 addr;
++		addr = ehca_map_vaddr(sg_virt(sg));
++		if (ehca_dma_mapping_error(dev, addr))
++			return 0;
++
++		sg->dma_address = addr;
++		sg->dma_length = sg->length;
++	}
++	return nents;
++}
++
++static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
++			      int nents, enum dma_data_direction direction)
++{
++	/* This is only a stub; nothing to be done here */
++}
++
++static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg)
++{
++	return sg->dma_address;
++}
++
++static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg)
++{
++	return sg->length;
++}
++
++static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
++					 size_t size,
++					 enum dma_data_direction dir)
++{
++	dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
++}
++
++static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
++					    size_t size,
++					    enum dma_data_direction dir)
++{
++	dma_sync_single_for_device(dev->dma_device, addr, size, dir);
++}
++
++static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
++				     u64 *dma_handle, gfp_t flag)
++{
++	struct page *p;
++	void *addr = NULL;
++	u64 dma_addr;
++
++	p = alloc_pages(flag, get_order(size));
++	if (p) {
++		addr = page_address(p);
++		dma_addr = ehca_map_vaddr(addr);
++		if (ehca_dma_mapping_error(dev, dma_addr)) {
++			free_pages((unsigned long)addr,	get_order(size));
++			return NULL;
++		}
++		if (dma_handle)
++			*dma_handle = dma_addr;
++		return addr;
++	}
++	return NULL;
++}
++
++static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
++				   void *cpu_addr, u64 dma_handle)
++{
++	if (cpu_addr && size)
++		free_pages((unsigned long)cpu_addr, get_order(size));
++}
++
++
++struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
++	.mapping_error          = ehca_dma_mapping_error,
++	.map_single             = ehca_dma_map_single,
++	.unmap_single           = ehca_dma_unmap_single,
++	.map_page               = ehca_dma_map_page,
++	.unmap_page             = ehca_dma_unmap_page,
++	.map_sg                 = ehca_dma_map_sg,
++	.unmap_sg               = ehca_dma_unmap_sg,
++	.dma_address            = ehca_dma_address,
++	.dma_len                = ehca_dma_len,
++	.sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
++	.sync_single_for_device = ehca_dma_sync_single_for_device,
++	.alloc_coherent         = ehca_dma_alloc_coherent,
++	.free_coherent          = ehca_dma_free_coherent,
++};
+diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h
+index bc8f4e3..50d8b51 100644
+--- a/drivers/infiniband/hw/ehca/ehca_mrmw.h
++++ b/drivers/infiniband/hw/ehca/ehca_mrmw.h
+@@ -42,6 +42,11 @@
+ #ifndef _EHCA_MRMW_H_
+ #define _EHCA_MRMW_H_
+ 
++enum ehca_reg_type {
++	EHCA_REG_MR,
++	EHCA_REG_BUSMAP_MR
++};
++
+ int ehca_reg_mr(struct ehca_shca *shca,
+ 		struct ehca_mr *e_mr,
+ 		u64 *iova_start,
+@@ -50,7 +55,8 @@ int ehca_reg_mr(struct ehca_shca *shca,
+ 		struct ehca_pd *e_pd,
+ 		struct ehca_mr_pginfo *pginfo,
+ 		u32 *lkey,
+-		u32 *rkey);
++		u32 *rkey,
++		enum ehca_reg_type reg_type);
+ 
+ int ehca_reg_mr_rpages(struct ehca_shca *shca,
+ 		       struct ehca_mr *e_mr,
+@@ -118,4 +124,9 @@ void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+ 
+ void ehca_mr_deletenew(struct ehca_mr *mr);
+ 
++int ehca_create_busmap(void);
++
++void ehca_destroy_busmap(void);
++
++extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+ #endif  /*_EHCA_MRMW_H_*/
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0060-bump_version.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0060-bump_version.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,24 @@
+commit 1d4d6da535be97b710e87a33c4828c97c36eee21
+Author: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+Date:   Tue Jun 23 10:30:04 2009 -0700
+
+    IB/ehca: Bump version number
+    
+    Increment version number for DMEM toleration.
+    
+    Signed-off-by: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+Index: ofa_kernel-1.5/drivers/infiniband/hw/ehca/ehca_main.c
+===================================================================
+--- ofa_kernel-1.5.orig/drivers/infiniband/hw/ehca/ehca_main.c	2009-07-24 04:33:15.000000000 -0400
++++ ofa_kernel-1.5/drivers/infiniband/hw/ehca/ehca_main.c	2009-07-24 04:34:44.000000000 -0400
+@@ -52,7 +52,7 @@
+ #include "ehca_tools.h"
+ #include "hcp_if.h"
+ 
+-#define HCAD_VERSION "0026"
++#define HCAD_VERSION "0028"
+ 
+ MODULE_LICENSE("Dual BSD/GPL");
+ MODULE_AUTHOR("Christoph Raisch <raisch at de.ibm.com>");
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0070-port_autodetect_as_default.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0070-port_autodetect_as_default.patch	2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,45 @@
+commit 0e71ff3afd4229862da6be540adc0d8de18187ea
+Author: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+Date:   Tue Jul 14 21:37:59 2009 -0700
+
+    IB/ehca: Make port autodetect mode the default
+    
+    Make port autodetect mode the default for the ehca driver. The
+    autodetect code has been in the kernel for several releases now and
+    has proved to be stable.
+    
+    Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
+index fab18a2..5b635aa 100644
+--- a/drivers/infiniband/hw/ehca/ehca_main.c
++++ b/drivers/infiniband/hw/ehca/ehca_main.c
+@@ -52,7 +52,7 @@
+ #include "ehca_tools.h"
+ #include "hcp_if.h"
+ 
+-#define HCAD_VERSION "0028"
++#define HCAD_VERSION "0029"
+ 
+ MODULE_LICENSE("Dual BSD/GPL");
+ MODULE_AUTHOR("Christoph Raisch <raisch at de.ibm.com>");
+@@ -64,7 +64,7 @@ static int ehca_hw_level      = 0;
+ static int ehca_poll_all_eqs  = 1;
+ 
+ int ehca_debug_level   = 0;
+-int ehca_nr_ports      = 2;
++int ehca_nr_ports      = -1;
+ int ehca_use_hp_mr     = 0;
+ int ehca_port_act_time = 30;
+ int ehca_static_rate   = -1;
+@@ -95,8 +95,8 @@ MODULE_PARM_DESC(hw_level,
+ 		 "Hardware level (0: autosensing (default), "
+ 		 "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+ MODULE_PARM_DESC(nr_ports,
+-		 "number of connected ports (-1: autodetect, 1: port one only, "
+-		 "2: two ports (default)");
++		 "number of connected ports (-1: autodetect (default), "
++		 "1: port one only, 2: two ports)");
+ MODULE_PARM_DESC(use_hp_mr,
+ 		 "Use high performance MRs (default: no)");
+ MODULE_PARM_DESC(port_act_time,


