[ofa-general] [PATCH] rds: support for IB_DEVICE_LOCAL_DMA_LKEY

Jon Mason jon at opengridcomputing.com
Thu Jul 31 11:55:06 PDT 2008


For iWARP, there is a limitation where syncs to remote memory need write
permission.  Allowing remote write creates a potential security risk, because
all memory becomes available to remote clients.  Using the local_dma_lkey
removes the need for remote write permission on local memory regions.  The
patch below converts the usage of dma_mr's to dma_local_lkey and removes the
allocation of dma_mr's (if IB_DEVICE_LOCAL_DMA_LKEY is supported).

Also, Chelsio has a limitation of not being able to access DMA MR regions that
reside in memory greater than 4GB.  With this patch, rds bcopy will work on
systems with greater than 4GB RAM.

For IB, using local_dma_lkey removes the need for DMA MR allocations (presuming
that the driver supports IB_DEVICE_LOCAL_DMA_LKEY).

Signed-Off-By: Jon Mason <jon at opengridcomputing.com>

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 4ba4805..8fd8de4 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -85,6 +85,7 @@ void rds_ib_add_one(struct ib_device *device)
 
 	spin_lock_init(&rds_ibdev->spinlock);
 
+	rds_ibdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
 
@@ -103,18 +104,21 @@ void rds_ib_add_one(struct ib_device *device)
 	if (IS_ERR(rds_ibdev->pd))
 		goto free_dev;
 
-	if (device->node_type != RDMA_NODE_RNIC) {
-		rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-					IB_ACCESS_LOCAL_WRITE);
-	} else {
-		/* Why does it have to have these permissions? */
-		rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-					IB_ACCESS_REMOTE_READ |
-					IB_ACCESS_REMOTE_WRITE |
-					IB_ACCESS_LOCAL_WRITE);
-	}
-	if (IS_ERR(rds_ibdev->mr))
-		goto err_pd;
+	if (!rds_ibdev->dma_local_lkey) {
+		if (device->node_type != RDMA_NODE_RNIC) {
+			rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+						IB_ACCESS_LOCAL_WRITE);
+		} else {
+			/* Why does it have to have these permissions? */
+			rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+						IB_ACCESS_REMOTE_READ |
+						IB_ACCESS_REMOTE_WRITE |
+						IB_ACCESS_LOCAL_WRITE);
+		}
+		if (IS_ERR(rds_ibdev->mr))
+			goto err_pd;
+	} else
+		rds_ibdev->mr = NULL;
 
 	/* Tell the RDMA code to use the fastreg API */
 	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)
@@ -134,7 +138,8 @@ void rds_ib_add_one(struct ib_device *device)
 	goto free_attr;
 
 err_mr:
-	ib_dereg_mr(rds_ibdev->mr);
+	if (!rds_ibdev->dma_local_lkey)
+		ib_dereg_mr(rds_ibdev->mr);
 err_pd:
 	ib_dealloc_pd(rds_ibdev->pd);
 free_dev:
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 9e17075..13eb1f2 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -157,7 +157,8 @@ struct rds_ib_connection {
 	/* Protocol version specific information */
 	unsigned int		i_flowctl : 1,	/* enable/disable flow ctl */
 				i_iwarp   : 1,	/* this is actually iWARP not IB */
-				i_fastreg : 1;	/* device supports fastreg */
+				i_fastreg : 1,	/* device supports fastreg */
+				i_dma_local_lkey : 1;
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
@@ -189,7 +190,8 @@ struct rds_ib_device {
 	unsigned int		max_fmrs;
 	int			max_sge;
 	unsigned int		max_wrs;
-	unsigned int		use_fastreg : 1;
+	unsigned int		use_fastreg : 1,
+				dma_local_lkey : 1;
 	spinlock_t		spinlock;
 };
 
@@ -264,6 +266,10 @@ static void inline rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 }
 #define ib_dma_sync_sg_for_device	rds_ib_dma_sync_sg_for_device
 
+static inline u32 rds_ib_local_dma_lkey(struct rds_ib_connection *ic)
+{
+	return (ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey);
+}
 
 /* ib.c */
 extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 9969504..89be6ba 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -516,6 +516,7 @@ static int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	/* Remember whether this is IB or iWARP */
 	ic->i_iwarp = (cm_id->device->node_type == RDMA_NODE_RNIC);
 	ic->i_fastreg = rds_ibdev->use_fastreg;
+	ic->i_dma_local_lkey = rds_ibdev->dma_local_lkey;
 
  	/* We got halfway through setting up the ib_connection, if we
  	 * fail now, we have to take the long route out of this mess. */
@@ -671,6 +672,7 @@ out:
 int rds_ib_conn_connect(struct rds_connection *conn)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev;
 	struct sockaddr_in src, dest;
 	int ret;
 
@@ -698,8 +700,12 @@ int rds_ib_conn_connect(struct rds_connection *conn)
 		goto out;
 	}
 
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
 	/* Now check the device type and set i_iwarp */
 	ic->i_iwarp = (ic->i_cm_id->device->node_type == RDMA_NODE_RNIC);
+	ic->i_fastreg = rds_ibdev->use_fastreg;
+	ic->i_dma_local_lkey = rds_ibdev->dma_local_lkey;
 
 	dest.sin_family = AF_INET;
 	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 9f72556..6738758 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -97,12 +97,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		sge = rds_ib_data_sge(ic, recv->r_sge);
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 
 		sge = rds_ib_header_sge(ic, recv->r_sge);
 		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 	}
 }
 
@@ -364,7 +364,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
 
 	sge->addr = ic->i_ack_dma;
 	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
+	sge->lkey = rds_ib_local_dma_lkey(ic);
 
 	wr->sg_list = sge;
 	wr->num_sge = 1;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 4878d3b..6d4e99d 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -145,12 +145,12 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 		send->s_wr.ex.imm_data = 0;
 
 		sge = rds_ib_data_sge(ic, send->s_sge);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 
 		sge = rds_ib_header_sge(ic, send->s_sge);
 		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 	}
 }
 
@@ -432,7 +432,7 @@ rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
 		sge = rds_ib_data_sge(ic, send->s_sge);
 		sge->addr = buffer;
 		sge->length = length;
-		sge->lkey = ic->i_mr->lkey;
+		sge->lkey = rds_ib_local_dma_lkey(ic);
 
 		sge = rds_ib_header_sge(ic, send->s_sge);
 	} else {
@@ -444,7 +444,7 @@ rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
 
 	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
 	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
+	sge->lkey = rds_ib_local_dma_lkey(ic);
 }
 
 /*
@@ -813,7 +813,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 			len = sg_dma_len(scat);
 			send->s_sge[j].addr = sg_dma_address(scat);
 			send->s_sge[j].length = len;
-			send->s_sge[j].lkey = ic->i_mr->lkey;
+			send->s_sge[j].lkey = rds_ib_local_dma_lkey(ic);
 
 			sent += len;
 			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);




More information about the general mailing list