[ewg] [PATCH OFED-1.5] ehca: sync with mainline
Alexander Schmidt
alexs at linux.vnet.ibm.com
Fri Jul 24 02:13:29 PDT 2009
Hi Vlad,
Please apply the following patch for OFED-1.5 to get the latest fixes for the
ehca driver.
Thanks,
Alex
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0010-replace_vmalloc.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0010-replace_vmalloc.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,43 @@
+commit bf31a1a02eb28d9bda0bb74345df7889faeb7335
+Author: Anton Blanchard <antonb at au1.ibm.com>
+Date: Wed May 13 16:52:40 2009 -0700
+
+ IB/ehca: Replace vmalloc() with kmalloc() for queue allocation
+
+ To improve performance of driver resource allocation, replace
+ vmalloc() calls with kmalloc().
+
+ Signed-off-by: Stefan Roscher <stefan.roscher at de.ibm.com>
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+index c3a3284..a260559 100644
+--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c
++++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+@@ -220,7 +220,7 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ queue->small_page = NULL;
+
+ /* allocate queue page pointers */
+- queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
++ queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL);
+ if (!queue->queue_pages) {
+ ehca_gen_err("Couldn't allocate queue page list");
+ return 0;
+@@ -240,7 +240,7 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ ipz_queue_ctor_exit0:
+ ehca_gen_err("Couldn't alloc pages queue=%p "
+ "nr_of_pages=%x", queue, nr_of_pages);
+- vfree(queue->queue_pages);
++ kfree(queue->queue_pages);
+
+ return 0;
+ }
+@@ -262,7 +262,7 @@ int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+ free_page((unsigned long)queue->queue_pages[i]);
+ }
+
+- vfree(queue->queue_pages);
++ kfree(queue->queue_pages);
+
+ return 1;
+ }
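
To illustrate why kmalloc() fits here: the queue page list is only an array
of page pointers, e.g. a 512-page queue needs 512 * sizeof(void *) = 4 KB,
which the slab allocator returns as physically contiguous memory without the
page-table setup and lazy TLB flushing that vmalloc() incurs. A minimal
sketch of the allocation (illustrative only, not part of the patch; the
helper name is made up -- the driver itself uses kmalloc() plus memset()):

#include <linux/slab.h>

/* Allocate the zeroed page-pointer array for a queue of nr_of_pages pages. */
static void **alloc_queue_page_list(int nr_of_pages)
{
	return kcalloc(nr_of_pages, sizeof(void *), GFP_KERNEL);
}

For very large queues this pointer array itself can grow big enough that
physically contiguous memory becomes scarce, which is what the next patch
addresses.
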
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0020-vmalloc_for_big_allocation.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0020-vmalloc_for_big_allocation.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,56 @@
+commit c94f156f63c835ffc02b686f9d4238b106f31a5d
+Author: Stefan Roscher <ossrosch at linux.vnet.ibm.com>
+Date: Wed May 13 16:52:42 2009 -0700
+
+ IB/ehca: Fall back to vmalloc() for big allocations
+
+ In case of large queue pairs there is the possibility of allocation
+ failures due to memory fragmentation when using kmalloc(). To ensure
+ the memory is allocated even if kmalloc() cannot find chunks which
+ are big enough, we fall back to allocating the memory with vmalloc().
+
+ Signed-off-by: Stefan Roscher <stefan.roscher at de.ibm.com>
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+index a260559..1227c59 100644
+--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c
++++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+@@ -222,8 +222,11 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ /* allocate queue page pointers */
+ queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL);
+ if (!queue->queue_pages) {
+- ehca_gen_err("Couldn't allocate queue page list");
+- return 0;
++ queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
++ if (!queue->queue_pages) {
++ ehca_gen_err("Couldn't allocate queue page list");
++ return 0;
++ }
+ }
+ memset(queue->queue_pages, 0, nr_of_pages * sizeof(void *));
+
+@@ -240,7 +243,10 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ ipz_queue_ctor_exit0:
+ ehca_gen_err("Couldn't alloc pages queue=%p "
+ "nr_of_pages=%x", queue, nr_of_pages);
+- kfree(queue->queue_pages);
++ if (is_vmalloc_addr(queue->queue_pages))
++ vfree(queue->queue_pages);
++ else
++ kfree(queue->queue_pages);
+
+ return 0;
+ }
+@@ -262,7 +268,10 @@ int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+ free_page((unsigned long)queue->queue_pages[i]);
+ }
+
+- kfree(queue->queue_pages);
++ if (is_vmalloc_addr(queue->queue_pages))
++ vfree(queue->queue_pages);
++ else
++ kfree(queue->queue_pages);
+
+ return 1;
+ }
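
Taken together, the two patches above implement the usual "try kmalloc(),
fall back to vmalloc()" pattern, with the free path selected via
is_vmalloc_addr(). A minimal sketch of that pattern (illustrative only, not
part of the patches; the function names are made up):

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

static void *queue_list_alloc(size_t bytes)
{
	/* Prefer fast, physically contiguous slab memory... */
	void *p = kmalloc(bytes, GFP_KERNEL | __GFP_NOWARN);

	/* ...but survive fragmentation by mapping individual pages instead. */
	if (!p)
		p = vmalloc(bytes);
	return p;
}

static void queue_list_free(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}
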
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0030-remove_allocation_for_user_qp.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0030-remove_allocation_for_user_qp.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,318 @@
+commit 1988d1fa1a9d642c5714a6afc9775fba0627f3ed
+Author: Stefan Roscher <ossrosch at linux.vnet.ibm.com>
+Date: Wed May 13 16:52:43 2009 -0700
+
+ IB/ehca: Remove unnecessary memory operations for userspace queue pairs
+
+ The queue map for flush completion circumvention is only used for
+ kernel space queue pairs. This patch skips the allocation of the
+ queue maps in case the QP is created for userspace. In addition, this
+ patch does not iomap the galpas for kernel usage if the queue pair is
+ only used in userspace. These changes improve the performance of
+ creating userspace queue pairs.
+
+ Signed-off-by: Stefan Roscher <stefan.roscher at de.ibm.com>
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
+index 00c1081..ead4e71 100644
+--- a/drivers/infiniband/hw/ehca/ehca_qp.c
++++ b/drivers/infiniband/hw/ehca/ehca_qp.c
+@@ -461,7 +461,7 @@ static struct ehca_qp *internal_create_qp(
+ ib_device);
+ struct ib_ucontext *context = NULL;
+ u64 h_ret;
+- int is_llqp = 0, has_srq = 0;
++ int is_llqp = 0, has_srq = 0, is_user = 0;
+ int qp_type, max_send_sge, max_recv_sge, ret;
+
+ /* h_call's out parameters */
+@@ -609,9 +609,6 @@ static struct ehca_qp *internal_create_qp(
+ }
+ }
+
+- if (pd->uobject && udata)
+- context = pd->uobject->context;
+-
+ my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+ if (!my_qp) {
+ ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+@@ -619,6 +616,11 @@ static struct ehca_qp *internal_create_qp(
+ return ERR_PTR(-ENOMEM);
+ }
+
++ if (pd->uobject && udata) {
++ is_user = 1;
++ context = pd->uobject->context;
++ }
++
+ atomic_set(&my_qp->nr_events, 0);
+ init_waitqueue_head(&my_qp->wait_completion);
+ spin_lock_init(&my_qp->spinlock_s);
+@@ -707,7 +709,7 @@ static struct ehca_qp *internal_create_qp(
+ (parms.squeue.is_small || parms.rqueue.is_small);
+ }
+
+- h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms);
++ h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+ h_ret);
+@@ -769,18 +771,20 @@ static struct ehca_qp *internal_create_qp(
+ goto create_qp_exit2;
+ }
+
+- my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+- my_qp->ipz_squeue.qe_size;
+- my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+- sizeof(struct ehca_qmap_entry));
+- if (!my_qp->sq_map.map) {
+- ehca_err(pd->device, "Couldn't allocate squeue "
+- "map ret=%i", ret);
+- goto create_qp_exit3;
++ if (!is_user) {
++ my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
++ my_qp->ipz_squeue.qe_size;
++ my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
++ sizeof(struct ehca_qmap_entry));
++ if (!my_qp->sq_map.map) {
++ ehca_err(pd->device, "Couldn't allocate squeue "
++ "map ret=%i", ret);
++ goto create_qp_exit3;
++ }
++ INIT_LIST_HEAD(&my_qp->sq_err_node);
++ /* to avoid the generation of bogus flush CQEs */
++ reset_queue_map(&my_qp->sq_map);
+ }
+- INIT_LIST_HEAD(&my_qp->sq_err_node);
+- /* to avoid the generation of bogus flush CQEs */
+- reset_queue_map(&my_qp->sq_map);
+ }
+
+ if (HAS_RQ(my_qp)) {
+@@ -792,20 +796,21 @@ static struct ehca_qp *internal_create_qp(
+ "and pages ret=%i", ret);
+ goto create_qp_exit4;
+ }
+-
+- my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+- my_qp->ipz_rqueue.qe_size;
+- my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+- sizeof(struct ehca_qmap_entry));
+- if (!my_qp->rq_map.map) {
+- ehca_err(pd->device, "Couldn't allocate squeue "
+- "map ret=%i", ret);
+- goto create_qp_exit5;
++ if (!is_user) {
++ my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
++ my_qp->ipz_rqueue.qe_size;
++ my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
++ sizeof(struct ehca_qmap_entry));
++ if (!my_qp->rq_map.map) {
++ ehca_err(pd->device, "Couldn't allocate squeue "
++ "map ret=%i", ret);
++ goto create_qp_exit5;
++ }
++ INIT_LIST_HEAD(&my_qp->rq_err_node);
++ /* to avoid the generation of bogus flush CQEs */
++ reset_queue_map(&my_qp->rq_map);
+ }
+- INIT_LIST_HEAD(&my_qp->rq_err_node);
+- /* to avoid the generation of bogus flush CQEs */
+- reset_queue_map(&my_qp->rq_map);
+- } else if (init_attr->srq) {
++ } else if (init_attr->srq && !is_user) {
+ /* this is a base QP, use the queue map of the SRQ */
+ my_qp->rq_map = my_srq->rq_map;
+ INIT_LIST_HEAD(&my_qp->rq_err_node);
+@@ -918,7 +923,7 @@ create_qp_exit7:
+ kfree(my_qp->mod_qp_parm);
+
+ create_qp_exit6:
+- if (HAS_RQ(my_qp))
++ if (HAS_RQ(my_qp) && !is_user)
+ vfree(my_qp->rq_map.map);
+
+ create_qp_exit5:
+@@ -926,7 +931,7 @@ create_qp_exit5:
+ ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+
+ create_qp_exit4:
+- if (HAS_SQ(my_qp))
++ if (HAS_SQ(my_qp) && !is_user)
+ vfree(my_qp->sq_map.map);
+
+ create_qp_exit3:
+@@ -1244,6 +1249,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ u64 update_mask;
+ u64 h_ret;
+ int bad_wqe_cnt = 0;
++ int is_user = 0;
+ int squeue_locked = 0;
+ unsigned long flags = 0;
+
+@@ -1266,6 +1272,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ ret = ehca2ib_return_code(h_ret);
+ goto modify_qp_exit1;
+ }
++ if (ibqp->uobject)
++ is_user = 1;
+
+ qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+
+@@ -1728,7 +1736,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ goto modify_qp_exit2;
+ }
+ }
+- if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)) {
++ if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
++ && !is_user) {
+ ret = check_for_left_cqes(my_qp, shca);
+ if (ret)
+ goto modify_qp_exit2;
+@@ -1738,16 +1747,17 @@ static int internal_modify_qp(struct ib_qp *ibqp,
+ ipz_qeit_reset(&my_qp->ipz_rqueue);
+ ipz_qeit_reset(&my_qp->ipz_squeue);
+
+- if (qp_cur_state == IB_QPS_ERR) {
++ if (qp_cur_state == IB_QPS_ERR && !is_user) {
+ del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+ if (HAS_RQ(my_qp))
+ del_from_err_list(my_qp->recv_cq,
+ &my_qp->rq_err_node);
+ }
+- reset_queue_map(&my_qp->sq_map);
++ if (!is_user)
++ reset_queue_map(&my_qp->sq_map);
+
+- if (HAS_RQ(my_qp))
++ if (HAS_RQ(my_qp) && !is_user)
+ reset_queue_map(&my_qp->rq_map);
+ }
+
+@@ -2138,10 +2148,12 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ int ret;
+ u64 h_ret;
+ u8 port_num;
++ int is_user = 0;
+ enum ib_qp_type qp_type;
+ unsigned long flags;
+
+ if (uobject) {
++ is_user = 1;
+ if (my_qp->mm_count_galpa ||
+ my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+ ehca_err(dev, "Resources still referenced in "
+@@ -2168,10 +2180,10 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ * SRQs will never get into an error list and do not have a recv_cq,
+ * so we need to skip them here.
+ */
+- if (HAS_RQ(my_qp) && !IS_SRQ(my_qp))
++ if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+ del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+
+- if (HAS_SQ(my_qp))
++ if (HAS_SQ(my_qp) && !is_user)
+ del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+ /* now wait until all pending events have completed */
+@@ -2209,13 +2221,13 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+
+ if (HAS_RQ(my_qp)) {
+ ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+-
+- vfree(my_qp->rq_map.map);
++ if (!is_user)
++ vfree(my_qp->rq_map.map);
+ }
+ if (HAS_SQ(my_qp)) {
+ ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+-
+- vfree(my_qp->sq_map.map);
++ if (!is_user)
++ vfree(my_qp->sq_map.map);
+ }
+ kmem_cache_free(qp_cache, my_qp);
+ atomic_dec(&shca->num_qps);
+diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
+index d0ab0c0..4d5dc33 100644
+--- a/drivers/infiniband/hw/ehca/hcp_if.c
++++ b/drivers/infiniband/hw/ehca/hcp_if.c
+@@ -284,7 +284,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ param->act_pages = (u32)outs[4];
+
+ if (ret == H_SUCCESS)
+- hcp_galpas_ctor(&cq->galpas, outs[5], outs[6]);
++ hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+@@ -293,7 +293,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ }
+
+ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+- struct ehca_alloc_qp_parms *parms)
++ struct ehca_alloc_qp_parms *parms, int is_user)
+ {
+ u64 ret;
+ u64 allocate_controls, max_r10_reg, r11, r12;
+@@ -359,7 +359,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+ (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+
+ if (ret == H_SUCCESS)
+- hcp_galpas_ctor(&parms->galpas, outs[6], outs[6]);
++ hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
+index 2c3c6e0..39c1c36 100644
+--- a/drivers/infiniband/hw/ehca/hcp_if.h
++++ b/drivers/infiniband/hw/ehca/hcp_if.h
+@@ -78,7 +78,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ * initialize resources, create empty QPPTs (2 rings).
+ */
+ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+- struct ehca_alloc_qp_parms *parms);
++ struct ehca_alloc_qp_parms *parms, int is_user);
+
+ u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id,
+diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c
+index 2148210..b3e0e72 100644
+--- a/drivers/infiniband/hw/ehca/hcp_phyp.c
++++ b/drivers/infiniband/hw/ehca/hcp_phyp.c
+@@ -54,12 +54,15 @@ int hcall_unmap_page(u64 mapaddr)
+ return 0;
+ }
+
+-int hcp_galpas_ctor(struct h_galpas *galpas,
++int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ u64 paddr_kernel, u64 paddr_user)
+ {
+- int ret = hcall_map_page(paddr_kernel, &galpas->kernel.fw_handle);
+- if (ret)
+- return ret;
++ if (!is_user) {
++ int ret = hcall_map_page(paddr_kernel, &galpas->kernel.fw_handle);
++ if (ret)
++ return ret;
++ } else
++ galpas->kernel.fw_handle = 0;
+
+ galpas->user.fw_handle = paddr_user;
+
+diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h
+index 5305c2a..204227d 100644
+--- a/drivers/infiniband/hw/ehca/hcp_phyp.h
++++ b/drivers/infiniband/hw/ehca/hcp_phyp.h
+@@ -78,7 +78,7 @@ static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+ *(volatile u64 __force *)addr = value;
+ }
+
+-int hcp_galpas_ctor(struct h_galpas *galpas,
++int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ u64 paddr_kernel, u64 paddr_user);
+
+ int hcp_galpas_dtor(struct h_galpas *galpas);
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0040-remove_bitmask_macros.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0040-remove_bitmask_macros.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,132 @@
+commit 25a52393270ca48c7d0848672ad4423313033c3d
+Author: Joachim Fenkes <fenkes at de.ibm.com>
+Date: Wed Jun 3 13:25:42 2009 -0700
+
+ IB/ehca: Remove superfluous bitmasks from QP control block
+
+ All the fields in the control block are nicely right-aligned, so no
+ masking is necessary.
+
+ Signed-off-by: Joachim Fenkes <fenkes at de.ibm.com>
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
+index 1798e64..689c357 100644
+--- a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
++++ b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
+@@ -165,7 +165,6 @@ struct hcp_modify_qp_control_block {
+ #define MQPCB_MASK_ALT_P_KEY_IDX EHCA_BMASK_IBM( 7, 7)
+ #define MQPCB_MASK_RDMA_ATOMIC_CTRL EHCA_BMASK_IBM( 8, 8)
+ #define MQPCB_MASK_QP_STATE EHCA_BMASK_IBM( 9, 9)
+-#define MQPCB_QP_STATE EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES EHCA_BMASK_IBM(11, 11)
+ #define MQPCB_MASK_PATH_MIGRATION_STATE EHCA_BMASK_IBM(12, 12)
+ #define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP EHCA_BMASK_IBM(13, 13)
+@@ -176,60 +175,33 @@ struct hcp_modify_qp_control_block {
+ #define MQPCB_MASK_RETRY_COUNT EHCA_BMASK_IBM(18, 18)
+ #define MQPCB_MASK_TIMEOUT EHCA_BMASK_IBM(19, 19)
+ #define MQPCB_MASK_PATH_MTU EHCA_BMASK_IBM(20, 20)
+-#define MQPCB_PATH_MTU EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_MAX_STATIC_RATE EHCA_BMASK_IBM(21, 21)
+-#define MQPCB_MAX_STATIC_RATE EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_DLID EHCA_BMASK_IBM(22, 22)
+-#define MQPCB_DLID EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_RNR_RETRY_COUNT EHCA_BMASK_IBM(23, 23)
+-#define MQPCB_RNR_RETRY_COUNT EHCA_BMASK_IBM(29, 31)
+ #define MQPCB_MASK_SOURCE_PATH_BITS EHCA_BMASK_IBM(24, 24)
+-#define MQPCB_SOURCE_PATH_BITS EHCA_BMASK_IBM(25, 31)
+ #define MQPCB_MASK_TRAFFIC_CLASS EHCA_BMASK_IBM(25, 25)
+-#define MQPCB_TRAFFIC_CLASS EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_HOP_LIMIT EHCA_BMASK_IBM(26, 26)
+-#define MQPCB_HOP_LIMIT EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_SOURCE_GID_IDX EHCA_BMASK_IBM(27, 27)
+-#define MQPCB_SOURCE_GID_IDX EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_FLOW_LABEL EHCA_BMASK_IBM(28, 28)
+-#define MQPCB_FLOW_LABEL EHCA_BMASK_IBM(12, 31)
+ #define MQPCB_MASK_DEST_GID EHCA_BMASK_IBM(30, 30)
+ #define MQPCB_MASK_SERVICE_LEVEL_AL EHCA_BMASK_IBM(31, 31)
+-#define MQPCB_SERVICE_LEVEL_AL EHCA_BMASK_IBM(28, 31)
+ #define MQPCB_MASK_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(32, 32)
+-#define MQPCB_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(31, 31)
+ #define MQPCB_MASK_RETRY_COUNT_AL EHCA_BMASK_IBM(33, 33)
+-#define MQPCB_RETRY_COUNT_AL EHCA_BMASK_IBM(29, 31)
+ #define MQPCB_MASK_TIMEOUT_AL EHCA_BMASK_IBM(34, 34)
+-#define MQPCB_TIMEOUT_AL EHCA_BMASK_IBM(27, 31)
+ #define MQPCB_MASK_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(35, 35)
+-#define MQPCB_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_DLID_AL EHCA_BMASK_IBM(36, 36)
+-#define MQPCB_DLID_AL EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(37, 37)
+-#define MQPCB_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(29, 31)
+ #define MQPCB_MASK_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(38, 38)
+-#define MQPCB_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(25, 31)
+ #define MQPCB_MASK_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(39, 39)
+-#define MQPCB_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_HOP_LIMIT_AL EHCA_BMASK_IBM(40, 40)
+-#define MQPCB_HOP_LIMIT_AL EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(41, 41)
+-#define MQPCB_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(24, 31)
+ #define MQPCB_MASK_FLOW_LABEL_AL EHCA_BMASK_IBM(42, 42)
+-#define MQPCB_FLOW_LABEL_AL EHCA_BMASK_IBM(12, 31)
+ #define MQPCB_MASK_DEST_GID_AL EHCA_BMASK_IBM(44, 44)
+ #define MQPCB_MASK_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(45, 45)
+-#define MQPCB_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(46, 46)
+-#define MQPCB_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(47, 47)
+-#define MQPCB_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(31, 31)
+-#define MQPCB_QP_NUMBER EHCA_BMASK_IBM( 8, 31)
+ #define MQPCB_MASK_QP_ENABLE EHCA_BMASK_IBM(48, 48)
+-#define MQPCB_QP_ENABLE EHCA_BMASK_IBM(31, 31)
+ #define MQPCB_MASK_CURR_SRQ_LIMIT EHCA_BMASK_IBM(49, 49)
+-#define MQPCB_CURR_SRQ_LIMIT EHCA_BMASK_IBM(16, 31)
+ #define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG EHCA_BMASK_IBM(50, 50)
+ #define MQPCB_MASK_SHARED_RQ_HNDL EHCA_BMASK_IBM(51, 51)
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
+index ead4e71..0338f1f 100644
+--- a/drivers/infiniband/hw/ehca/ehca_qp.c
++++ b/drivers/infiniband/hw/ehca/ehca_qp.c
+@@ -1962,19 +1962,13 @@ int ehca_query_qp(struct ib_qp *qp,
+ qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+ qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+
+- qp_attr->pkey_index =
+- EHCA_BMASK_GET(MQPCB_PRIM_P_KEY_IDX, qpcb->prim_p_key_idx);
+-
+- qp_attr->port_num =
+- EHCA_BMASK_GET(MQPCB_PRIM_PHYS_PORT, qpcb->prim_phys_port);
+-
++ qp_attr->pkey_index = qpcb->prim_p_key_idx;
++ qp_attr->port_num = qpcb->prim_phys_port;
+ qp_attr->timeout = qpcb->timeout;
+ qp_attr->retry_cnt = qpcb->retry_count;
+ qp_attr->rnr_retry = qpcb->rnr_retry_count;
+
+- qp_attr->alt_pkey_index =
+- EHCA_BMASK_GET(MQPCB_PRIM_P_KEY_IDX, qpcb->alt_p_key_idx);
+-
++ qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+ qp_attr->alt_port_num = qpcb->alt_phys_port;
+ qp_attr->alt_timeout = qpcb->timeout_al;
+
+@@ -2061,8 +2055,7 @@ int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+- mqpcb->curr_srq_limit =
+- EHCA_BMASK_SET(MQPCB_CURR_SRQ_LIMIT, attr->srq_limit);
++ mqpcb->curr_srq_limit = attr->srq_limit;
+ mqpcb->qp_aff_asyn_ev_log_reg =
+ EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+ }
+@@ -2125,8 +2118,7 @@ int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+
+ srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+ srq_attr->max_sge = 3;
+- srq_attr->srq_limit = EHCA_BMASK_GET(
+- MQPCB_CURR_SRQ_LIMIT, qpcb->curr_srq_limit);
++ srq_attr->srq_limit = qpcb->curr_srq_limit;
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
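
In other words, each value already sits in the least-significant bits of its
control block word, so the shift-and-mask done by the removed GET/SET
wrappers changes nothing and they can simply be dropped. A small standalone
illustration of that arithmetic (not driver code; the field layout and values
are made up):

#include <stdio.h>
#include <stdint.h>

/* Generic field extraction: shift the field down to bit 0, then mask. */
static uint32_t get_field(uint32_t word, unsigned int shift, unsigned int width)
{
	return (word >> shift) & ((1u << width) - 1);
}

int main(void)
{
	uint32_t curr_srq_limit = 0x1234;	/* hypothetical 16-bit limit */

	/* Right-aligned field: shift is 0, so extraction is the identity. */
	printf("masked 0x%x, raw 0x%x\n",
	       get_field(curr_srq_limit, 0, 16), curr_srq_limit);
	return 0;
}
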
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0050-dmem_toleration.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0050-dmem_toleration.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,745 @@
+commit 0cf89dcdbc53f2b43e4ce7419b6ff47f4309c2eb
+Author: Hannes Hering <hering2 at de.ibm.com>
+Date: Mon Jun 22 22:18:51 2009 -0700
+
+ IB/ehca: Tolerate dynamic memory operations before driver load
+
+ Implement toleration of dynamic memory operations and 16 GB gigantic
+ pages, where "toleration" means that the driver can cope with dynamic
+ memory operations that happen before the driver is loaded. While the
+ ehca driver is loaded, dynamic memory operations are still prohibited
+ by returning NOTIFY_BAD from the memory notifier.
+
+ On module load the driver walks through available system memory,
+ checks for available memory ranges and then registers the kernel
+ internal memory region accordingly. The translation of address ranges
+ is implemented via a 3-level busmap.
+
+ Signed-off-by: Hannes Hering <hering2 at de.ibm.com>
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
+index ce4e6ef..14a18b7 100644
+--- a/drivers/infiniband/hw/ehca/ehca_main.c
++++ b/drivers/infiniband/hw/ehca/ehca_main.c
+@@ -506,6 +506,7 @@ static int ehca_init_device(struct ehca_shca *shca)
+ shca->ib_device.detach_mcast = ehca_detach_mcast;
+ shca->ib_device.process_mad = ehca_process_mad;
+ shca->ib_device.mmap = ehca_mmap;
++ shca->ib_device.dma_ops = &ehca_dma_mapping_ops;
+
+ if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+ shca->ib_device.uverbs_cmd_mask |=
+@@ -1028,17 +1029,23 @@ static int __init ehca_module_init(void)
+ goto module_init1;
+ }
+
++ ret = ehca_create_busmap();
++ if (ret) {
++ ehca_gen_err("Cannot create busmap.");
++ goto module_init2;
++ }
++
+ ret = ibmebus_register_driver(&ehca_driver);
+ if (ret) {
+ ehca_gen_err("Cannot register eHCA device driver");
+ ret = -EINVAL;
+- goto module_init2;
++ goto module_init3;
+ }
+
+ ret = register_memory_notifier(&ehca_mem_nb);
+ if (ret) {
+ ehca_gen_err("Failed registering memory add/remove notifier");
+- goto module_init3;
++ goto module_init4;
+ }
+
+ if (ehca_poll_all_eqs != 1) {
+@@ -1053,9 +1060,12 @@ static int __init ehca_module_init(void)
+
+ return 0;
+
+-module_init3:
++module_init4:
+ ibmebus_unregister_driver(&ehca_driver);
+
++module_init3:
++ ehca_destroy_busmap();
++
+ module_init2:
+ ehca_destroy_slab_caches();
+
+@@ -1073,6 +1083,8 @@ static void __exit ehca_module_exit(void)
+
+ unregister_memory_notifier(&ehca_mem_nb);
+
++ ehca_destroy_busmap();
++
+ ehca_destroy_slab_caches();
+
+ ehca_destroy_comp_pool();
+diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
+index 72f83f7..7663a2a 100644
+--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
++++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
+@@ -53,6 +53,38 @@
+ /* max number of rpages (per hcall register_rpages) */
+ #define MAX_RPAGES 512
+
++/* DMEM toleration management */
++#define EHCA_SECTSHIFT SECTION_SIZE_BITS
++#define EHCA_SECTSIZE (1UL << EHCA_SECTSHIFT)
++#define EHCA_HUGEPAGESHIFT 34
++#define EHCA_HUGEPAGE_SIZE (1UL << EHCA_HUGEPAGESHIFT)
++#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
++#define EHCA_INVAL_ADDR 0xFFFFFFFFFFFFFFFFULL
++#define EHCA_DIR_INDEX_SHIFT 13 /* 8k Entries in 64k block */
++#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
++#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
++#define EHCA_TOP_MAP_SIZE (0x10000) /* currently fixed map size */
++#define EHCA_DIR_MAP_SIZE (0x10000)
++#define EHCA_ENT_MAP_SIZE (0x10000)
++#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
++
++static unsigned long ehca_mr_len;
++
++/*
++ * Memory map data structures
++ */
++struct ehca_dir_bmap {
++ u64 ent[EHCA_MAP_ENTRIES];
++};
++struct ehca_top_bmap {
++ struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
++};
++struct ehca_bmap {
++ struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
++};
++
++static struct ehca_bmap *ehca_bmap;
++
+ static struct kmem_cache *mr_cache;
+ static struct kmem_cache *mw_cache;
+
+@@ -68,6 +100,8 @@ enum ehca_mr_pgsize {
+ #define EHCA_MR_PGSHIFT1M 20
+ #define EHCA_MR_PGSHIFT16M 24
+
++static u64 ehca_map_vaddr(void *caddr);
++
+ static u32 ehca_encode_hwpage_size(u32 pgsize)
+ {
+ int log = ilog2(pgsize);
+@@ -135,7 +169,8 @@ struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+ goto get_dma_mr_exit0;
+ }
+
+- ret = ehca_reg_maxmr(shca, e_maxmr, (u64 *)KERNELBASE,
++ ret = ehca_reg_maxmr(shca, e_maxmr,
++ (void *)ehca_map_vaddr((void *)KERNELBASE),
+ mr_access_flags, e_pd,
+ &e_maxmr->ib.ib_mr.lkey,
+ &e_maxmr->ib.ib_mr.rkey);
+@@ -251,7 +286,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+ e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+- &e_mr->ib.ib_mr.rkey);
++ &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit1;
+@@ -370,7 +405,7 @@ reg_user_mr_fallback:
+
+ ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+ e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+- &e_mr->ib.ib_mr.rkey);
++ &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+ ehca_warn(pd->device, "failed to register mr "
+ "with hwpage_size=%llx", hwpage_size);
+@@ -794,7 +829,7 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+ ret = ehca_reg_mr(shca, e_fmr, NULL,
+ fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+ mr_access_flags, e_pd, &pginfo,
+- &tmp_lkey, &tmp_rkey);
++ &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+ if (ret) {
+ ib_fmr = ERR_PTR(ret);
+ goto alloc_fmr_exit1;
+@@ -983,6 +1018,10 @@ free_fmr_exit0:
+
+ /*----------------------------------------------------------------------*/
+
++static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
++ struct ehca_mr *e_mr,
++ struct ehca_mr_pginfo *pginfo);
++
+ int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+@@ -991,7 +1030,8 @@ int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey, /*OUT*/
+- u32 *rkey) /*OUT*/
++ u32 *rkey, /*OUT*/
++ enum ehca_reg_type reg_type)
+ {
+ int ret;
+ u64 h_ret;
+@@ -1015,7 +1055,13 @@ int ehca_reg_mr(struct ehca_shca *shca,
+
+ e_mr->ipz_mr_handle = hipzout.handle;
+
+- ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
++ if (reg_type == EHCA_REG_BUSMAP_MR)
++ ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
++ else if (reg_type == EHCA_REG_MR)
++ ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
++ else
++ ret = -EINVAL;
++
+ if (ret)
+ goto ehca_reg_mr_exit1;
+
+@@ -1316,7 +1362,7 @@ int ehca_rereg_mr(struct ehca_shca *shca,
+ e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+- e_pd, pginfo, lkey, rkey);
++ e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+ if (ret) {
+ u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+ memcpy(&e_mr->flags, &(save_mr.flags),
+@@ -1409,7 +1455,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca,
+ ret = ehca_reg_mr(shca, e_fmr, NULL,
+ (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+ e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+- &tmp_rkey);
++ &tmp_rkey, EHCA_REG_MR);
+ if (ret) {
+ u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+ memcpy(&e_fmr->flags, &(save_mr.flags),
+@@ -1478,6 +1524,90 @@ ehca_reg_smr_exit0:
+ } /* end ehca_reg_smr() */
+
+ /*----------------------------------------------------------------------*/
++static inline void *ehca_calc_sectbase(int top, int dir, int idx)
++{
++ unsigned long ret = idx;
++ ret |= dir << EHCA_DIR_INDEX_SHIFT;
++ ret |= top << EHCA_TOP_INDEX_SHIFT;
++ return abs_to_virt(ret << SECTION_SIZE_BITS);
++}
++
++#define ehca_bmap_valid(entry) \
++ ((u64)entry != (u64)EHCA_INVAL_ADDR)
++
++static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
++ struct ehca_shca *shca, struct ehca_mr *mr,
++ struct ehca_mr_pginfo *pginfo)
++{
++ u64 h_ret = 0;
++ unsigned long page = 0;
++ u64 rpage = virt_to_abs(kpage);
++ int page_count;
++
++ void *sectbase = ehca_calc_sectbase(top, dir, idx);
++ if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
++ ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
++ "hwpage_size does not fit to "
++ "section start address");
++ }
++ page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
++
++ while (page < page_count) {
++ u64 rnum;
++ for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
++ rnum++) {
++ void *pg = sectbase + ((page++) * pginfo->hwpage_size);
++ kpage[rnum] = virt_to_abs(pg);
++ }
++
++ h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
++ ehca_encode_hwpage_size(pginfo->hwpage_size),
++ 0, rpage, rnum);
++
++ if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
++ ehca_err(&shca->ib_device, "register_rpage_mr failed");
++ return h_ret;
++ }
++ }
++ return h_ret;
++}
++
++static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
++ struct ehca_shca *shca, struct ehca_mr *mr,
++ struct ehca_mr_pginfo *pginfo)
++{
++ u64 hret = H_SUCCESS;
++ int idx;
++
++ for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
++ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
++ continue;
++
++ hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
++ pginfo);
++ if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
++ return hret;
++ }
++ return hret;
++}
++
++static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
++ struct ehca_mr *mr,
++ struct ehca_mr_pginfo *pginfo)
++{
++ u64 hret = H_SUCCESS;
++ int dir;
++
++ for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
++ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
++ continue;
++
++ hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
++ if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
++ return hret;
++ }
++ return hret;
++}
+
+ /* register internal max-MR to internal SHCA */
+ int ehca_reg_internal_maxmr(
+@@ -1495,6 +1625,11 @@ int ehca_reg_internal_maxmr(
+ u32 num_hwpages;
+ u64 hw_pgsize;
+
++ if (!ehca_bmap) {
++ ret = -EFAULT;
++ goto ehca_reg_internal_maxmr_exit0;
++ }
++
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(&shca->ib_device, "out of memory");
+@@ -1504,8 +1639,8 @@ int ehca_reg_internal_maxmr(
+ e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+
+ /* register internal max-MR on HCA */
+- size_maxmr = (u64)high_memory - PAGE_OFFSET;
+- iova_start = (u64 *)KERNELBASE;
++ size_maxmr = ehca_mr_len;
++ iova_start = (u64 *)ehca_map_vaddr((void *)KERNELBASE);
+ ib_pbuf.addr = 0;
+ ib_pbuf.size = size_maxmr;
+ num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+@@ -1524,7 +1659,7 @@ int ehca_reg_internal_maxmr(
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+ &pginfo, &e_mr->ib.ib_mr.lkey,
+- &e_mr->ib.ib_mr.rkey);
++ &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+ if (ret) {
+ ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+ "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+@@ -2077,8 +2212,8 @@ int ehca_mr_is_maxmr(u64 size,
+ u64 *iova_start)
+ {
+ /* a MR is treated as max-MR only if it fits following: */
+- if ((size == ((u64)high_memory - PAGE_OFFSET)) &&
+- (iova_start == (void *)KERNELBASE)) {
++ if ((size == ehca_mr_len) &&
++ (iova_start == (void *)ehca_map_vaddr((void *)KERNELBASE))) {
+ ehca_gen_dbg("this is a max-MR");
+ return 1;
+ } else
+@@ -2184,3 +2319,350 @@ void ehca_cleanup_mrmw_cache(void)
+ if (mw_cache)
+ kmem_cache_destroy(mw_cache);
+ }
++
++static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
++ int dir)
++{
++ if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
++ ehca_top_bmap->dir[dir] =
++ kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
++ if (!ehca_top_bmap->dir[dir])
++ return -ENOMEM;
++ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
++ memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
++ }
++ return 0;
++}
++
++static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
++{
++ if (!ehca_bmap_valid(ehca_bmap->top[top])) {
++ ehca_bmap->top[top] =
++ kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
++ if (!ehca_bmap->top[top])
++ return -ENOMEM;
++ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
++ memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
++ }
++ return ehca_init_top_bmap(ehca_bmap->top[top], dir);
++}
++
++static inline int ehca_calc_index(unsigned long i, unsigned long s)
++{
++ return (i >> s) & EHCA_INDEX_MASK;
++}
++
++void ehca_destroy_busmap(void)
++{
++ int top, dir;
++
++ if (!ehca_bmap)
++ return;
++
++ for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
++ if (!ehca_bmap_valid(ehca_bmap->top[top]))
++ continue;
++ for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
++ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
++ continue;
++
++ kfree(ehca_bmap->top[top]->dir[dir]);
++ }
++
++ kfree(ehca_bmap->top[top]);
++ }
++
++ kfree(ehca_bmap);
++ ehca_bmap = NULL;
++}
++
++static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
++{
++ unsigned long i, start_section, end_section;
++ int top, dir, idx;
++
++ if (!nr_pages)
++ return 0;
++
++ if (!ehca_bmap) {
++ ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
++ if (!ehca_bmap)
++ return -ENOMEM;
++ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
++ memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
++ }
++
++ start_section = phys_to_abs(pfn * PAGE_SIZE) / EHCA_SECTSIZE;
++ end_section = phys_to_abs((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
++ for (i = start_section; i < end_section; i++) {
++ int ret;
++ top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
++ dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
++ idx = i & EHCA_INDEX_MASK;
++
++ ret = ehca_init_bmap(ehca_bmap, top, dir);
++ if (ret) {
++ ehca_destroy_busmap();
++ return ret;
++ }
++ ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
++ ehca_mr_len += EHCA_SECTSIZE;
++ }
++ return 0;
++}
++
++static int ehca_is_hugepage(unsigned long pfn)
++{
++ int page_order;
++
++ if (pfn & EHCA_HUGEPAGE_PFN_MASK)
++ return 0;
++
++ page_order = compound_order(pfn_to_page(pfn));
++ if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
++ return 0;
++
++ return 1;
++}
++
++static int ehca_create_busmap_callback(unsigned long initial_pfn,
++ unsigned long total_nr_pages, void *arg)
++{
++ int ret;
++ unsigned long pfn, start_pfn, end_pfn, nr_pages;
++
++ if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
++ return ehca_update_busmap(initial_pfn, total_nr_pages);
++
++ /* Given chunk is >= 16GB -> check for hugepages */
++ start_pfn = initial_pfn;
++ end_pfn = initial_pfn + total_nr_pages;
++ pfn = start_pfn;
++
++ while (pfn < end_pfn) {
++ if (ehca_is_hugepage(pfn)) {
++ /* Add mem found in front of the hugepage */
++ nr_pages = pfn - start_pfn;
++ ret = ehca_update_busmap(start_pfn, nr_pages);
++ if (ret)
++ return ret;
++ /* Skip the hugepage */
++ pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
++ start_pfn = pfn;
++ } else
++ pfn += (EHCA_SECTSIZE / PAGE_SIZE);
++ }
++
++ /* Add mem found behind the hugepage(s) */
++ nr_pages = pfn - start_pfn;
++ return ehca_update_busmap(start_pfn, nr_pages);
++}
++
++int ehca_create_busmap(void)
++{
++ int ret;
++
++ ehca_mr_len = 0;
++ ret = walk_memory_resource(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
++ ehca_create_busmap_callback);
++ return ret;
++}
++
++static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
++ struct ehca_mr *e_mr,
++ struct ehca_mr_pginfo *pginfo)
++{
++ int top;
++ u64 hret, *kpage;
++
++ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
++ if (!kpage) {
++ ehca_err(&shca->ib_device, "kpage alloc failed");
++ return -ENOMEM;
++ }
++ for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
++ if (!ehca_bmap_valid(ehca_bmap->top[top]))
++ continue;
++ hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
++ if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
++ break;
++ }
++
++ ehca_free_fw_ctrlblock(kpage);
++
++ if (hret == H_SUCCESS)
++ return 0; /* Everything is fine */
++ else {
++ ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
++ "h_ret=%lli e_mr=%p top=%x lkey=%x "
++ "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
++ e_mr->ib.ib_mr.lkey,
++ shca->ipz_hca_handle.handle,
++ e_mr->ipz_mr_handle.handle);
++ return ehca2ib_return_code(hret);
++ }
++}
++
++static u64 ehca_map_vaddr(void *caddr)
++{
++ int top, dir, idx;
++ unsigned long abs_addr, offset;
++ u64 entry;
++
++ if (!ehca_bmap)
++ return EHCA_INVAL_ADDR;
++
++ abs_addr = virt_to_abs(caddr);
++ top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
++ if (!ehca_bmap_valid(ehca_bmap->top[top]))
++ return EHCA_INVAL_ADDR;
++
++ dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
++ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
++ return EHCA_INVAL_ADDR;
++
++ idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
++
++ entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
++ if (ehca_bmap_valid(entry)) {
++ offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
++ return entry | offset;
++ } else
++ return EHCA_INVAL_ADDR;
++}
++
++static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
++{
++ return dma_addr == EHCA_INVAL_ADDR;
++}
++
++static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
++ size_t size, enum dma_data_direction direction)
++{
++ if (cpu_addr)
++ return ehca_map_vaddr(cpu_addr);
++ else
++ return EHCA_INVAL_ADDR;
++}
++
++static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
++ enum dma_data_direction direction)
++{
++ /* This is only a stub; nothing to be done here */
++}
++
++static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction direction)
++{
++ u64 addr;
++
++ if (offset + size > PAGE_SIZE)
++ return EHCA_INVAL_ADDR;
++
++ addr = ehca_map_vaddr(page_address(page));
++ if (!ehca_dma_mapping_error(dev, addr))
++ addr += offset;
++
++ return addr;
++}
++
++static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
++ enum dma_data_direction direction)
++{
++ /* This is only a stub; nothing to be done here */
++}
++
++static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
++ int nents, enum dma_data_direction direction)
++{
++ struct scatterlist *sg;
++ int i;
++
++ for_each_sg(sgl, sg, nents, i) {
++ u64 addr;
++ addr = ehca_map_vaddr(sg_virt(sg));
++ if (ehca_dma_mapping_error(dev, addr))
++ return 0;
++
++ sg->dma_address = addr;
++ sg->dma_length = sg->length;
++ }
++ return nents;
++}
++
++static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
++ int nents, enum dma_data_direction direction)
++{
++ /* This is only a stub; nothing to be done here */
++}
++
++static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg)
++{
++ return sg->dma_address;
++}
++
++static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg)
++{
++ return sg->length;
++}
++
++static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
++ size_t size,
++ enum dma_data_direction dir)
++{
++ dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
++}
++
++static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
++ size_t size,
++ enum dma_data_direction dir)
++{
++ dma_sync_single_for_device(dev->dma_device, addr, size, dir);
++}
++
++static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
++ u64 *dma_handle, gfp_t flag)
++{
++ struct page *p;
++ void *addr = NULL;
++ u64 dma_addr;
++
++ p = alloc_pages(flag, get_order(size));
++ if (p) {
++ addr = page_address(p);
++ dma_addr = ehca_map_vaddr(addr);
++ if (ehca_dma_mapping_error(dev, dma_addr)) {
++ free_pages((unsigned long)addr, get_order(size));
++ return NULL;
++ }
++ if (dma_handle)
++ *dma_handle = dma_addr;
++ return addr;
++ }
++ return NULL;
++}
++
++static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
++ void *cpu_addr, u64 dma_handle)
++{
++ if (cpu_addr && size)
++ free_pages((unsigned long)cpu_addr, get_order(size));
++}
++
++
++struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
++ .mapping_error = ehca_dma_mapping_error,
++ .map_single = ehca_dma_map_single,
++ .unmap_single = ehca_dma_unmap_single,
++ .map_page = ehca_dma_map_page,
++ .unmap_page = ehca_dma_unmap_page,
++ .map_sg = ehca_dma_map_sg,
++ .unmap_sg = ehca_dma_unmap_sg,
++ .dma_address = ehca_dma_address,
++ .dma_len = ehca_dma_len,
++ .sync_single_for_cpu = ehca_dma_sync_single_for_cpu,
++ .sync_single_for_device = ehca_dma_sync_single_for_device,
++ .alloc_coherent = ehca_dma_alloc_coherent,
++ .free_coherent = ehca_dma_free_coherent,
++};
+diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h
+index bc8f4e3..50d8b51 100644
+--- a/drivers/infiniband/hw/ehca/ehca_mrmw.h
++++ b/drivers/infiniband/hw/ehca/ehca_mrmw.h
+@@ -42,6 +42,11 @@
+ #ifndef _EHCA_MRMW_H_
+ #define _EHCA_MRMW_H_
+
++enum ehca_reg_type {
++ EHCA_REG_MR,
++ EHCA_REG_BUSMAP_MR
++};
++
+ int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+@@ -50,7 +55,8 @@ int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+- u32 *rkey);
++ u32 *rkey,
++ enum ehca_reg_type reg_type);
+
+ int ehca_reg_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+@@ -118,4 +124,9 @@ void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+
+ void ehca_mr_deletenew(struct ehca_mr *mr);
+
++int ehca_create_busmap(void);
++
++void ehca_destroy_busmap(void);
++
++extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+ #endif /*_EHCA_MRMW_H_*/
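
For reference, the busmap translation above is plain index arithmetic: with
EHCA_DIR_INDEX_SHIFT = 13 and EHCA_TOP_INDEX_SHIFT = 26 (both defined in the
patch), a memory section number is split into three 13-bit indices selecting
the top, dir and ent levels of the map. A standalone sketch of that
decomposition (illustrative only; the section number is made up):

#include <stdio.h>

#define DIR_INDEX_SHIFT	13
#define TOP_INDEX_SHIFT	(DIR_INDEX_SHIFT * 2)
#define INDEX_MASK	((1UL << DIR_INDEX_SHIFT) - 1)	/* 8191 */

int main(void)
{
	unsigned long section = 0x12345;	/* hypothetical section number */
	unsigned long top = (section >> TOP_INDEX_SHIFT) & INDEX_MASK;
	unsigned long dir = (section >> DIR_INDEX_SHIFT) & INDEX_MASK;
	unsigned long ent = section & INDEX_MASK;

	printf("section 0x%lx -> top %lu, dir %lu, ent %lu\n",
	       section, top, dir, ent);
	return 0;
}

ehca_map_vaddr() performs the same split on an absolute address by adding
EHCA_SECTSHIFT to each shift, then returns the value stored at the selected
ent slot OR'd with the offset within the section, or EHCA_INVAL_ADDR if that
section was never registered.
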
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0060-bump_version.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0060-bump_version.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,24 @@
+commit 1d4d6da535be97b710e87a33c4828c97c36eee21
+Author: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+Date: Tue Jun 23 10:30:04 2009 -0700
+
+ IB/ehca: Bump version number
+
+ Increment version number for DMEM toleration.
+
+ Signed-off-by: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+Index: ofa_kernel-1.5/drivers/infiniband/hw/ehca/ehca_main.c
+===================================================================
+--- ofa_kernel-1.5.orig/drivers/infiniband/hw/ehca/ehca_main.c 2009-07-24 04:33:15.000000000 -0400
++++ ofa_kernel-1.5/drivers/infiniband/hw/ehca/ehca_main.c 2009-07-24 04:34:44.000000000 -0400
+@@ -52,7 +52,7 @@
+ #include "ehca_tools.h"
+ #include "hcp_if.h"
+
+-#define HCAD_VERSION "0026"
++#define HCAD_VERSION "0028"
+
+ MODULE_LICENSE("Dual BSD/GPL");
+ MODULE_AUTHOR("Christoph Raisch <raisch at de.ibm.com>");
Index: ofa_kernel-1.5/kernel_patches/fixes/ehca-0070-port_autodetect_as_default.patch
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ofa_kernel-1.5/kernel_patches/fixes/ehca-0070-port_autodetect_as_default.patch 2009-07-24 04:39:54.000000000 -0400
@@ -0,0 +1,45 @@
+commit 0e71ff3afd4229862da6be540adc0d8de18187ea
+Author: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+Date: Tue Jul 14 21:37:59 2009 -0700
+
+ IB/ehca: Make port autodetect mode the default
+
+ Make port autodetect mode the default for the ehca driver. The
+ autodetect code has been in the kernel for several releases now and
+ has proved to be stable.
+
+ Signed-off-by: Roland Dreier <rolandd at cisco.com>
+
+diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
+index fab18a2..5b635aa 100644
+--- a/drivers/infiniband/hw/ehca/ehca_main.c
++++ b/drivers/infiniband/hw/ehca/ehca_main.c
+@@ -52,7 +52,7 @@
+ #include "ehca_tools.h"
+ #include "hcp_if.h"
+
+-#define HCAD_VERSION "0028"
++#define HCAD_VERSION "0029"
+
+ MODULE_LICENSE("Dual BSD/GPL");
+ MODULE_AUTHOR("Christoph Raisch <raisch at de.ibm.com>");
+@@ -64,7 +64,7 @@ static int ehca_hw_level = 0;
+ static int ehca_poll_all_eqs = 1;
+
+ int ehca_debug_level = 0;
+-int ehca_nr_ports = 2;
++int ehca_nr_ports = -1;
+ int ehca_use_hp_mr = 0;
+ int ehca_port_act_time = 30;
+ int ehca_static_rate = -1;
+@@ -95,8 +95,8 @@ MODULE_PARM_DESC(hw_level,
+ "Hardware level (0: autosensing (default), "
+ "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+ MODULE_PARM_DESC(nr_ports,
+- "number of connected ports (-1: autodetect, 1: port one only, "
+- "2: two ports (default)");
++ "number of connected ports (-1: autodetect (default), "
++ "1: port one only, 2: two ports)");
+ MODULE_PARM_DESC(use_hp_mr,
+ "Use high performance MRs (default: no)");
+ MODULE_PARM_DESC(port_act_time,