[ofa-general] [PATCH RFC v2] RDMA: New Memory Extensions.

Steve Wise swise at opengridcomputing.com
Thu May 15 11:17:34 PDT 2008


The following patch proposes the API and core changes needed to
implement the IB BMME and iWARP equivalent memory extensions.

Please review these vs the verbs specs and see what I've missed.

This patch is a request for comments.

Steve.

Changes since Version 1:
	- ib_alloc_mr() -> ib_alloc_fast_reg_mr()
	- pbl_depth -> max_page_list_len
	- page_list_len -> max_page_list_len where it makes sense
	- int -> unsigned int where needed
	- fbo -> first_byte_offset
	- added page size and page_list_len to fast_reg union in ib_send_wr
	- rearranged work request fast_reg union of ib_send_wr to pack it
	- dropped remove_access parameter from ib_alloc_fast_reg_mr()
	- IB_DEVICE_MM_EXTENSIONS -> IB_DEVICE_MEM_MGT_EXTENSIONS
	- compiled

-----

RDMA: New Memory Extensions.

Support for the IB BMME and iWARP equivalent memory extensions to
non-shared memory regions.  This includes:

- allocation of an ib_mr for use in fast register work requests

- device-specific alloc/free of physical buffer lists for use in fast
register work requests.  This allows devices to allocate this memory as
needed (like via dma_alloc_coherent).

- fast register memory region work request

- invalidate local memory region work request

- read with invalidate local memory region work request (iWARP only)


Design details:

- New device capability flag added: IB_DEVICE_MEM_MGT_EXTENSIONS indicates
device support for this feature.

- New send WR opcode IB_WR_FAST_REG_MR used to issue a fast_reg request.

- New send WR opcode IB_WR_INVALIDATE_MR used to invalidate a fast_reg mr.

- New API function, ib_alloc_fast_reg_mr() used to allocate fast_reg memory
regions.

- New API function, ib_alloc_fast_reg_page_list to allocate
device-specific page lists.

- New API function, ib_free_fast_reg_page_list to free said page lists.


Usage Model:

- MR allocated with ib_alloc_fast_reg_mr()

- Page lists allocated via ib_alloc_fast_reg_page_list().

- MR made VALID and bound to a specific page list via
ib_post_send(IB_WR_FAST_REG_MR)

- MR made INVALID via ib_post_send(IB_WR_INVALIDATE_MR)

- MR deallocated with ib_dereg_mr()

- page lists deallocated via ib_free_fast_reg_page_list().

Applications can allocate a fast_reg mr once, and then can repeatedly
bind the mr to different physical memory SGLs via posting work requests
to the send queue.  For each outstanding mr-to-pbl binding in the SQ
pipe, a fast_reg_page_list needs to be allocated.  Thus pipelining can
be achieved while still allowing device-specific page_list processing.
---

 drivers/infiniband/core/verbs.c |   46 +++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h         |   55 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 0504208..0a334b4 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -755,6 +755,52 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+				struct ib_device *device, int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+	if (!IS_ERR(page_list)) {
+		page_list->device = device;
+		page_list->max_page_list_len = max_page_list_len;
+	}
+
+	return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
 /* Memory windows */
 
 struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 911a661..cbef5a6 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -106,6 +106,7 @@ enum ib_device_cap_flags {
 	IB_DEVICE_UD_IP_CSUM		= (1<<18),
 	IB_DEVICE_UD_TSO		= (1<<19),
 	IB_DEVICE_SEND_W_INV		= (1<<21),
+	IB_DEVICE_MEM_MGT_EXTENSIONS	= (1<<22),
 };
 
 enum ib_atomic_cap {
@@ -414,6 +415,8 @@ enum ib_wc_opcode {
 	IB_WC_FETCH_ADD,
 	IB_WC_BIND_MW,
 	IB_WC_LSO,
+	IB_WC_FAST_REG_MR,
+	IB_WC_INVALIDATE_MR,
 /*
  * Set value of IB_WC_RECV so consumers can test if a completion is a
  * receive by testing (opcode & IB_WC_RECV).
@@ -628,6 +631,9 @@ enum ib_wr_opcode {
 	IB_WR_ATOMIC_FETCH_AND_ADD,
 	IB_WR_LSO,
 	IB_WR_SEND_WITH_INV,
+	IB_WR_FAST_REG_MR,
+	IB_WR_INVALIDATE_MR,
+	IB_WR_READ_WITH_INV,
 };
 
 enum ib_send_flags {
@@ -676,6 +682,20 @@ struct ib_send_wr {
 			u16	pkey_index; /* valid for GSI only */
 			u8	port_num;   /* valid for DR SMPs on switch only */
 		} ud;
+		struct {
+			u64				iova_start;
+			struct ib_mr 			*mr;
+			struct ib_fast_reg_page_list	*page_list;
+			unsigned int			page_size;
+			unsigned int			page_list_len;
+			unsigned int			first_byte_offset;
+			u32				length;
+			int				access_flags;
+			
+		} fast_reg;
+		struct {
+			struct ib_mr 	*mr;
+		} local_inv;
 	} wr;
 };
 
@@ -1014,6 +1034,10 @@ struct ib_device {
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
+	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
+					       int max_page_list_len);
+	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, int page_list_len);
+	void			   (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
 	int                        (*rereg_phys_mr)(struct ib_mr *mr,
 						    int mr_rereg_mask,
 						    struct ib_pd *pd,
@@ -1808,6 +1832,37 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
 int ib_dereg_mr(struct ib_mr *mr);
 
 /**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list size to be allocated.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+struct ib_fast_reg_page_list {
+	struct ib_device 	*device;
+	u64			*page_list;
+	unsigned int		max_page_list_len;
+};
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array to be used
+ * in an IB_WR_FAST_REG_MR work request.  The resources allocated by this method
+ * allow for device-specific optimization of the FAST_REG operation.
+ * @device: ib device pointer.
+ * @page_list_len: depth of the page list array to be allocated.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+				struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ * page list array.
+ * @page_list: struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
  * ib_alloc_mw - Allocates a memory window.
  * @pd: The protection domain associated with the memory window.
  */



More information about the general mailing list