[ofa-general] [PATCH RFC v4 1/2] RDMA/Core: MEM_MGT_EXTENSIONS support
Steve Wise
swise at opengridcomputing.com
Tue May 27 11:35:49 PDT 2008
Support for the IB BMME and iWARP equivalent memory extensions to
non shared memory regions. This includes:
- allocation of an ib_mr for use in fast register work requests
- device-specific alloc/free of physical buffer lists for use in fast
register work requests. This allows devices to allocate this memory as
needed (like via dma_alloc_coherent).
- fast register memory region work request
- invalidate local memory region work request
- read with invalidate local memory region work request (iWARP only)
Design details:
- New device capability flag added: IB_DEVICE_MEM_MGT_EXTENSIONS indicates
device support for this feature.
- New send WR opcode IB_WR_FAST_REG_MR used to issue a fast_reg request.
- New send WR opcode IB_WR_INVALIDATE_MR used to invalidate a fast_reg mr.
- New API function, ib_alloc_fast_reg_mr() used to allocate fast_reg memory
regions.
- New API function, ib_alloc_fast_reg_page_list to allocate
device-specific page lists.
- New API function, ib_free_fast_reg_page_list to free said page lists.
- New API function, ib_update_fast_reg_key to allow the key portion of
the R_Key and L_Key of a fast_reg MR to be updated. Applications call
this if desired before posting the IB_WR_FAST_REG_MR.
Usage Model:
- MR allocated with ib_alloc_fast_reg_mr()
- Page lists allocated via ib_alloc_fast_reg_page_list().
- MR R_Key/L_Key "key" field updated with ib_update_fast_reg_key().
- MR made VALID and bound to a specific page list via
ib_post_send(IB_WR_FAST_REG_MR)
- MR made INVALID via ib_post_send(IB_WR_INVALIDATE_MR)
- MR deallocated with ib_dereg_mr()
- page lists dealloced via ib_free_fast_reg_page_list().
Applications can allocate a fast_reg mr once, and then can repeatedly
bind the mr to different physical memory SGLs via posting work
requests to the SQ. For each outstanding mr-to-pbl binding in the SQ pipe,
a fast_reg_page_list needs to be allocated. Thus pipelining can be
achieved while still allowing device-specific page_list processing.
The 4B fast_reg rkey or stag is composed of a 3B index, and a 1B key.
The application can change the key each time it fast-registers thus
allowing more control over the peer's use of the rkey (ie it can
effectively be changed each time the rkey is rebound to a page list).
Signed-off-by: Steve Wise <swise at opengridcomputing.com>
---
drivers/infiniband/core/verbs.c | 46 ++++++++++++++++++++++++
include/rdma/ib_verbs.h | 76 +++++++++++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 0 deletions(-)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 0504208..0a334b4 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -755,6 +755,52 @@ int ib_dereg_mr(struct ib_mr *mr)
}
EXPORT_SYMBOL(ib_dereg_mr);
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+ struct ib_mr *mr;
+ /* Fast-reg MR allocation is an optional driver hook; fail with -ENOSYS if absent. */
+ if (!pd->device->alloc_fast_reg_mr)
+ return ERR_PTR(-ENOSYS);
+ /* The driver performs the allocation; the result may be an ERR_PTR() value. */
+ mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+ /* On success, fill in the common ib_mr fields and bump the PD's use count. */
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+ /* Return the initialized MR, or the driver's error pointer unchanged. */
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+ struct ib_device *device, int max_page_list_len)
+{
+ struct ib_fast_reg_page_list *page_list;
+ /* Page-list allocation is an optional driver hook; fail with -ENOSYS if absent. */
+ if (!device->alloc_fast_reg_page_list)
+ return ERR_PTR(-ENOSYS);
+ /* The driver allocates the list; the result may be an ERR_PTR() value. */
+ page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+ /* On success, record the owning device and the caller's requested length. */
+ if (!IS_ERR(page_list)) {
+ page_list->device = device;
+ page_list->max_page_list_len = max_page_list_len;
+ }
+ /* Return the list, or the driver's error pointer unchanged. */
+ return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ page_list->device->free_fast_reg_page_list(page_list); /* freed by the owning device's hook; nothing is NULL-checked */
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
/* Memory windows */
struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 911a661..ede0c80 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -106,6 +106,7 @@ enum ib_device_cap_flags {
IB_DEVICE_UD_IP_CSUM = (1<<18),
IB_DEVICE_UD_TSO = (1<<19),
IB_DEVICE_SEND_W_INV = (1<<21),
+ IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<22),
};
enum ib_atomic_cap {
@@ -151,6 +152,7 @@ struct ib_device_attr {
int max_srq;
int max_srq_wr;
int max_srq_sge;
+ unsigned int max_fast_reg_page_list_len;
u16 max_pkeys;
u8 local_ca_ack_delay;
};
@@ -414,6 +416,8 @@ enum ib_wc_opcode {
IB_WC_FETCH_ADD,
IB_WC_BIND_MW,
IB_WC_LSO,
+ IB_WC_FAST_REG_MR,
+ IB_WC_INVALIDATE_MR,
/*
* Set value of IB_WC_RECV so consumers can test if a completion is a
* receive by testing (opcode & IB_WC_RECV).
@@ -628,6 +632,9 @@ enum ib_wr_opcode {
IB_WR_ATOMIC_FETCH_AND_ADD,
IB_WR_LSO,
IB_WR_SEND_WITH_INV,
+ IB_WR_FAST_REG_MR,
+ IB_WR_INVALIDATE_MR,
+ IB_WR_READ_WITH_INV,
};
enum ib_send_flags {
@@ -676,6 +683,19 @@ struct ib_send_wr {
u16 pkey_index; /* valid for GSI only */
u8 port_num; /* valid for DR SMPs on switch only */
} ud;
+ struct {
+ u64 iova_start;
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *page_list;
+ unsigned int page_shift;
+ unsigned int page_list_len;
+ unsigned int first_byte_offset;
+ u32 length;
+ int access_flags;
+ } fast_reg;
+ struct {
+ struct ib_mr *mr;
+ } local_inv;
} wr;
};
@@ -1014,6 +1034,10 @@ struct ib_device {
int (*query_mr)(struct ib_mr *mr,
struct ib_mr_attr *mr_attr);
int (*dereg_mr)(struct ib_mr *mr);
+ struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd,
+ int max_page_list_len);
+ struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, int page_list_len);
+ void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
int (*rereg_phys_mr)(struct ib_mr *mr,
int mr_rereg_mask,
struct ib_pd *pd,
@@ -1808,6 +1832,58 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
int ib_dereg_mr(struct ib_mr *mr);
/**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list size to be allocated.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+struct ib_fast_reg_page_list {
+ struct ib_device *device; /* device the list was allocated from; used again to free it */
+ u64 *page_list; /* page address array, filled in by the caller before posting */
+ unsigned int max_page_list_len; /* size of the page_list array, in entries */
+};
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device: ib device pointer.
+ * @page_list_len: size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list *
+ * and a page_list array that is at least page_list_len in size.
+ * The actual size is returned in max_page_list_len.
+ * The caller is responsible for initializing the contents of the
+ * page_list array before posting a send work request with the
+ * IB_WR_FAST_REG_MR opcode. The page_list array entries must be
+ * translated using one of the ib_dma_*() functions similar to the
+ * addresses passed to ib_map_phys_fmr(). Once the ib_post_send()
+ * is issued, the struct ib_fast_reg_page_list must not be modified
+ * by the caller until a completion notice is returned by the device.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+ struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ * page list array.
+ * @page_list: struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg
+ * R_Key and L_Key.
+ * @mr: struct ib_mr pointer to be updated.
+ * @newkey: new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+ mr->lkey = (mr->lkey & 0xffffff00) | newkey; /* keep the upper 24 bits, replace the low byte */
+ mr->rkey = (mr->rkey & 0xffffff00) | newkey; /* the same 8-bit key goes into the rkey */
+}
+
+/**
* ib_alloc_mw - Allocates a memory window.
* @pd: The protection domain associated with the memory window.
*/
More information about the general
mailing list