[ofa-general] [PATCH] RDMA/nes: Improved use of pbls

Chien Tung chien.tin.tung at intel.com
Wed Jan 21 09:11:08 PST 2009


From: Don Wood <donald.e.wood at intel.com>

Two-level 256-byte pbls were not implemented, so the driver could
report out of memory when in fact there were pbls still available.

The solution prefers to use 4KB pbls over two-level 256B pbls until
the number of 4KB pbls falls below a threshold.  At this point the
4KB pbl structure is converted to use 256B pbls, which prevents the
driver from running out of 4KB pbls too quickly.

Also, fixed two places where the software pbl counts were changed
before the hardware was updated.  This bug allowed another thread
to overallocate the hardware resources.

Signed-off-by: Don Wood <donald.e.wood at intel.com>
---
Sorry, the subject line in the last email was garbled.

Chien

 drivers/infiniband/hw/nes/nes_verbs.c |  247 ++++++++++++++++++++++----------
 1 files changed, 170 insertions(+), 77 deletions(-)

diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 4cfb4d9..488e981 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -551,6 +551,7 @@ static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	int i = 0;
+	int rc;
 
 	/* free the resources */
 	if (nesfmr->leaf_pbl_cnt == 0) {
@@ -572,6 +573,8 @@ static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
 	nesmr->ibmw.rkey = ibfmr->rkey;
 	nesmr->ibmw.uobject = NULL;
 
+	rc = nes_dealloc_mw(&nesmr->ibmw);
+
 	if (nesfmr->nesmr.pbls_used != 0) {
 		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
 		if (nesfmr->nesmr.pbl_4k) {
@@ -584,7 +587,7 @@ static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
 		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
 	}
 
-	return nes_dealloc_mw(&nesmr->ibmw);
+	return rc;
 }
 
 
@@ -1884,21 +1887,75 @@ static int nes_destroy_cq(struct ib_cq *ib_cq)
 	return ret;
 }
 
+/**
+ * root_256 - make root pbl entries address 256B leaf pbls (1 = ok, 0 = alloc failed)
+ */
+static u32 root_256(struct nes_device *nesdev,
+		    struct nes_root_vpbl *root_vpbl,
+		    struct nes_root_vpbl *new_root,
+		    u16 pbl_count_4k,
+		    u16 pbl_count_256) /* not read in this function */
+{
+	u64 leaf_pbl;
+	int i, j, k;
+
+	if (pbl_count_4k == 1) { /* single pbl: fill a newly allocated root */
+		new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+						512, &new_root->pbl_pbase);
+
+		if (new_root->pbl_vbase == NULL)
+			return 0;
+
+		leaf_pbl = (u64)root_vpbl->pbl_pbase; /* existing pbl is used as the leaf */
+		for (i = 0; i < 16; i++) { /* 16 entries, one per 256B step */
+			new_root->pbl_vbase[i].pa_low =
+				cpu_to_le32((u32)leaf_pbl);
+			new_root->pbl_vbase[i].pa_high =
+				cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+			leaf_pbl += 256; /* advance to the next 256B slice */
+		}
+	} else { /* expand root_vpbl in place; new_root is left untouched */
+		for (i = 3; i >= 0; i--) { /* descending: entries 1..3 are read before being overwritten */
+			j = i * 16; /* entry i moves to slot i * 16 */
+			root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i];
+			leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) +
+			    (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high))
+				<< 32);
+			for (k = 1; k < 16; k++) { /* remaining 15 slots: base + k * 256 */
+				leaf_pbl += 256;
+				root_vpbl->pbl_vbase[j + k].pa_low =
+						cpu_to_le32((u32)leaf_pbl);
+				root_vpbl->pbl_vbase[j + k].pa_high =
+				    cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+			}
+		}
+	}
+
+	return 1;
+}
+
 
 /**
  * nes_reg_mr
  */
 static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 		u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
-		dma_addr_t single_buffer, u16 pbl_count, u16 residual_page_count,
-		int acc, u64 *iova_start)
+		dma_addr_t single_buffer, u16 pbl_count_4k,
+		u16 residual_page_count_4k, int acc, u64 *iova_start,
+		u16 *actual_pbl_cnt, u8 *used_4k_pbls)
 {
 	struct nes_hw_cqp_wqe *cqp_wqe;
 	struct nes_cqp_request *cqp_request;
 	unsigned long flags;
 	int ret;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
-	/* int count; */
+	uint pg_cnt = 0;
+	u16 pbl_count_256;
+	u16 pbl_count = 0;
+	u8  use_256_pbls = 0;
+	u8  use_4k_pbls = 0;
+	u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0;
+	struct nes_root_vpbl new_root = {0, 0, 0};
 	u32 opcode = 0;
 	u16 major_code;
 
@@ -1911,41 +1968,70 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 	cqp_request->waiting = 1;
 	cqp_wqe = &cqp_request->cqp_wqe;
 
-	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-	/* track PBL resources */
-	if (pbl_count != 0) {
-		if (pbl_count > 1) {
-			/* Two level PBL */
-			if ((pbl_count+1) > nesadapter->free_4kpbl) {
-				nes_debug(NES_DBG_MR, "Out of 4KB Pbls for two level request.\n");
-				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-				nes_free_cqp_request(nesdev, cqp_request);
-				return -ENOMEM;
-			} else {
-				nesadapter->free_4kpbl -= pbl_count+1;
-			}
-		} else if (residual_page_count > 32) {
-			if (pbl_count > nesadapter->free_4kpbl) {
-				nes_debug(NES_DBG_MR, "Out of 4KB Pbls.\n");
-				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-				nes_free_cqp_request(nesdev, cqp_request);
-				return -ENOMEM;
-			} else {
-				nesadapter->free_4kpbl -= pbl_count;
+	if (pbl_count_4k) {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+		pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k;
+		pbl_count_256 = (pg_cnt + 31) / 32;
+		if (pg_cnt <= 32) {
+			if (pbl_count_256 <= nesadapter->free_256pbl)
+				use_256_pbls = 1;
+			else if (pbl_count_4k <= nesadapter->free_4kpbl)
+				use_4k_pbls = 1;
+		} else if (pg_cnt <= 2048) {
+			if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) &&
+			    (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) {
+				use_4k_pbls = 1;
+			} else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) {
+				use_256_pbls = 1;
+				use_two_level = 1;
+			} else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+				use_4k_pbls = 1;
 			}
 		} else {
-			if (pbl_count > nesadapter->free_256pbl) {
-				nes_debug(NES_DBG_MR, "Out of 256B Pbls.\n");
-				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-				nes_free_cqp_request(nesdev, cqp_request);
-				return -ENOMEM;
-			} else {
-				nesadapter->free_256pbl -= pbl_count;
-			}
+			if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl)
+				use_4k_pbls = 1;
+		}
+
+		if (use_256_pbls) {
+			pbl_count = pbl_count_256;
+			nesadapter->free_256pbl -= pbl_count + use_two_level;
+		} else if (use_4k_pbls) {
+			pbl_count =  pbl_count_4k;
+			nesadapter->free_4kpbl -= pbl_count + use_two_level;
+		} else {
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+			nes_debug(NES_DBG_MR, "Out of Pbls\n");
+			nes_free_cqp_request(nesdev, cqp_request);
+			return -ENOMEM;
 		}
+
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
 	}
 
-	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	if (use_256_pbls && use_two_level) {
+		if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k, pbl_count_256) == 1) {
+			if (new_root.pbl_pbase != 0)
+				root_vpbl = &new_root;
+		} else {
+			spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+			nesadapter->free_256pbl += pbl_count_256 + use_two_level;
+			use_256_pbls = 0;
+
+			if (pbl_count_4k == 1)
+				use_two_level = 0;
+			pbl_count = pbl_count_4k;
+
+			if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+				nesadapter->free_4kpbl -= pbl_count + use_two_level;
+				use_4k_pbls = 1;
+			}
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+			if (use_4k_pbls == 0)
+				return -ENOMEM;
+		}
+	}
 
 	opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
 					NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
@@ -1974,10 +2060,9 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 	} else {
 		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
 		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
-		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX,
-				(((pbl_count - 1) * 4096) + (residual_page_count*8)));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8));
 
-		if ((pbl_count > 1) || (residual_page_count > 32))
+		if (use_4k_pbls)
 			cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
 	}
 	barrier();
@@ -1994,13 +2079,25 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 	major_code = cqp_request->major_code;
 	nes_put_cqp_request(nesdev, cqp_request);
 
+	if ((!ret || major_code) && pbl_count != 0) {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		if (use_256_pbls)
+			nesadapter->free_256pbl += pbl_count + use_two_level;
+		else if (use_4k_pbls)
+			nesadapter->free_4kpbl += pbl_count + use_two_level;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	}
+	if (new_root.pbl_pbase)
+		pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase,
+				    new_root.pbl_pbase);
+
 	if (!ret)
 		return -ETIME;
 	else if (major_code)
 		return -EIO;
-	else
-		return 0;
 
+	*actual_pbl_cnt = pbl_count + use_two_level;
+	*used_4k_pbls = use_4k_pbls;
 	return 0;
 }
 
@@ -2165,18 +2262,14 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
 		pbl_count = root_pbl_index;
 	}
 	ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
-			buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start);
+			buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start,
+			&nesmr->pbls_used, &nesmr->pbl_4k);
 
 	if (ret == 0) {
 		nesmr->ibmr.rkey = stag;
 		nesmr->ibmr.lkey = stag;
 		nesmr->mode = IWNES_MEMREG_TYPE_MEM;
 		ibmr = &nesmr->ibmr;
-		nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0;
-		nesmr->pbls_used = pbl_count;
-		if (pbl_count > 1) {
-			nesmr->pbls_used++;
-		}
 	} else {
 		kfree(nesmr);
 		ibmr = ERR_PTR(-ENOMEM);
@@ -2454,8 +2547,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 					stag, (unsigned int)iova_start,
 					(unsigned int)region_length, stag_index,
 					(unsigned long long)region->length, pbl_count);
-			ret = nes_reg_mr( nesdev, nespd, stag, region->length, &root_vpbl,
-					first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, &iova_start);
+			ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl,
+					 first_dma_addr, pbl_count, (u16)cur_pbl_index, acc,
+					 &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k);
 
 			nes_debug(NES_DBG_MR, "ret=%d\n", ret);
 
@@ -2464,11 +2558,6 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				nesmr->ibmr.lkey = stag;
 				nesmr->mode = IWNES_MEMREG_TYPE_MEM;
 				ibmr = &nesmr->ibmr;
-				nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0;
-				nesmr->pbls_used = pbl_count;
-				if (pbl_count > 1) {
-					nesmr->pbls_used++;
-				}
 			} else {
 				ib_umem_release(region);
 				kfree(nesmr);
@@ -2607,24 +2696,6 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
 	cqp_request->waiting = 1;
 	cqp_wqe = &cqp_request->cqp_wqe;
 
-	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-	if (nesmr->pbls_used != 0) {
-		if (nesmr->pbl_4k) {
-			nesadapter->free_4kpbl += nesmr->pbls_used;
-			if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
-				printk(KERN_ERR PFX "free 4KB PBLs(%u) has exceeded the max(%u)\n",
-						nesadapter->free_4kpbl, nesadapter->max_4kpbl);
-			}
-		} else {
-			nesadapter->free_256pbl += nesmr->pbls_used;
-			if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
-				printk(KERN_ERR PFX "free 256B PBLs(%u) has exceeded the max(%u)\n",
-						nesadapter->free_256pbl, nesadapter->max_256pbl);
-			}
-		}
-	}
-
-	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
 	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
 	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
 			NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
@@ -2642,11 +2713,6 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
 			" CQP Major:Minor codes = 0x%04X:0x%04X\n",
 			ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
 
-	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-			(ib_mr->rkey & 0x0fffff00) >> 8);
-
-	kfree(nesmr);
-
 	major_code = cqp_request->major_code;
 	minor_code = cqp_request->minor_code;
 
@@ -2662,8 +2728,35 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
 				" to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
 				major_code, minor_code, ib_mr, ib_mr->rkey);
 		return -EIO;
-	} else
-		return 0;
+	}
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (nesmr->pbls_used != 0) {
+		if (nesmr->pbl_4k) {
+			nesadapter->free_4kpbl += nesmr->pbls_used;
+			if (nesadapter->free_4kpbl > nesadapter->max_4kpbl)
+				printk(KERN_ERR PFX "free 4KB PBLs(%u) has "
+				       "exceeded the max(%u)\n",
+				       nesadapter->free_4kpbl,
+				       nesadapter->max_4kpbl);
+		} else {
+			nesadapter->free_256pbl += nesmr->pbls_used;
+			if (nesadapter->free_256pbl > nesadapter->max_256pbl)
+				printk(KERN_ERR PFX "free 256B PBLs(%u) has "
+				       "exceeded the max(%u)\n",
+				       nesadapter->free_256pbl,
+				       nesadapter->max_256pbl);
+		}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+			  (ib_mr->rkey & 0x0fffff00) >> 8);
+
+	kfree(nesmr);
+
+	return 0;
 }
 
 
-- 
1.5.3.3




More information about the general mailing list