[openib-general] [PATCH] enhancement to rdma_bw and rdma_lat to utilize the RDMA CM

Steve Wise swise at opengridcomputing.com
Tue May 16 06:59:39 PDT 2006


I don't know who maintains src/userspace/perftest, but here is a patch
set that enables rdma_bw and rdma_lat to use the RDMA_CM with the
addition of the -c or --cma flag.

The rkey/addr info is exchanged in the private data, and SEND/RECV's are
used to sync the client/server before and after execution.

Also, I added -P or --poll to rdma_bw to choose how completions are
reaped: if you omit -P, it will block waiting for a completion event when
no completion is available; with -P it will spin polling the CQ instead.

Signed-off-by: Steve Wise <swise at opengridcomputing.com>



Index: rdma_lat.c
===================================================================
--- rdma_lat.c	(revision 7050)
+++ rdma_lat.c	(working copy)
@@ -53,6 +53,7 @@
 #include <time.h>
 
 #include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
 
 #include "get_clock.h"
 
@@ -71,7 +72,8 @@
 	struct ibv_context *context;
 	struct ibv_pd      *pd;
 	struct ibv_mr      *mr;
-	struct ibv_cq      *cq;
+	struct ibv_cq      *scq;
+	struct ibv_cq      *rcq;
 	struct ibv_qp      *qp;
 	void               *buf;
 	volatile char      *post_buf;
@@ -80,6 +82,7 @@
 	int                 tx_depth;
 	struct ibv_sge list;
 	struct ibv_send_wr wr;
+	struct rdma_cm_id  *cm_id;
 };
 
 struct pingpong_dest {
@@ -323,16 +326,22 @@
 		return NULL;
 	}
 
-	ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, NULL, 0);
-	if (!ctx->cq) {
+	ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
+	if (!ctx->rcq) {
 		fprintf(stderr, "Couldn't create CQ\n");
 		return NULL;
 	}
 
+	ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, NULL, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		return NULL;
+	}
+
 	{
 		struct ibv_qp_init_attr attr = {
-			.send_cq = ctx->cq,
-			.recv_cq = ctx->cq,
+			.send_cq = ctx->scq,
+			.recv_cq = ctx->rcq,
 			.cap     = {
 				.max_send_wr  = tx_depth,
 				/* Work around:  driver doesnt support
@@ -370,13 +379,6 @@
 		}
 	}
 
-	ctx->wr.wr_id      = PINGPONG_RDMA_WRID;
-	ctx->wr.sg_list    = &ctx->list;
-	ctx->wr.num_sge    = 1;
-	ctx->wr.opcode     = IBV_WR_RDMA_WRITE;
-	ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
-	ctx->wr.next       = NULL;
-
 	return ctx;
 }
 
@@ -489,6 +491,467 @@
 	return 0;
 }
 
+/* CMA STUFF */
+
+/* Post the single 1-byte receive that absorbs the peer's next sync SEND. */
+static void pp_post_recv(struct pingpong_context *ctx)
+{
+	struct ibv_recv_wr *bad_wr;
+	struct ibv_sge sge = {
+		.addr   = (uintptr_t) ctx->buf,
+		.length = 1,
+		.lkey   = ctx->mr->lkey
+	};
+	struct ibv_recv_wr recv_wr = {
+		.wr_id   = 0xdeadbeef,	/* tag verified by the pp_wait_for_*() helpers */
+		.sg_list = &sge,
+		.num_sge = 1
+	};
+	int err = ibv_post_recv(ctx->qp, &recv_wr, &bad_wr);
+	if (err) {
+		perror("ibv_post_recv");
+		fprintf(stderr, "%s ibv_post_recv failed %d\n", __FUNCTION__, err);
+	}
+}
+/* Set up buffer, PD, MR, both CQs and an RC QP on an already-bound cm_id; NULL on failure. */
+static struct pingpong_context *pp_init_cma_ctx(struct rdma_cm_id *cm_id,
+					    unsigned size,
+					    int tx_depth, int port)
+{
+	struct pingpong_context *ctx;
+
+	ctx = malloc(sizeof *ctx);
+	if (!ctx)
+		return NULL;
+	/* NOTE(review): none of the failure returns below release ctx or earlier resources. */
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+	/* The work buffer is two areas of `size` bytes each. */
+	ctx->buf = memalign(page_size, size * 2);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		return NULL;
+	}
+
+	memset(ctx->buf, 0, size * 2);
+	/* post_buf / poll_buf: last byte of the first / second area. */
+	ctx->post_buf = (char*)ctx->buf + (size - 1);
+	ctx->poll_buf = (char*)ctx->buf + (2 * size - 1);
+
+	ctx->cm_id = cm_id;
+	ctx->context = cm_id->verbs;
+	if (!ctx->context) {
+		fprintf(stderr, "%s Unbound cm_id!!\n", __FUNCTION__);
+		return NULL;
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		return NULL;
+	}
+
+        /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+         * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+         * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		return NULL;
+	}
+	/* Receive CQ depth of 1 matches max_recv_wr = 1 in the QP caps below. */
+	ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
+	if (!ctx->rcq) {
+		fprintf(stderr, "Couldn't create RCQ\n");
+		return NULL;
+	}
+
+	ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, NULL, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "Couldn't create SCQ\n");
+		return NULL;
+	}
+
+	{
+		struct ibv_qp_init_attr attr = {
+			.send_cq = ctx->scq,
+			.recv_cq = ctx->rcq,
+			.cap     = {
+				.max_send_wr  = tx_depth,
+				.max_recv_wr  = 1,
+				.max_send_sge = 1,
+				.max_recv_sge = 1,
+				.max_inline_data = size
+			},
+			.qp_type = IBV_QPT_RC
+		};
+
+		if (rdma_create_qp(ctx->cm_id, ctx->pd, &attr)) {
+			fprintf(stderr, "Couldn't create QP\n");
+			return NULL;
+		}
+		ctx->qp = ctx->cm_id->qp;
+	}
+	/* Pre-post the receive that will take the peer's first sync SEND. */
+	pp_post_recv(ctx);
+
+	return ctx;
+}
+
+/* Tear the connection down: the client initiates, both ends reap the CM event. */
+static void pp_close_cma(struct pingpong_context *ctx, const char *servername)
+{
+	struct rdma_cm_event *event;
+
+	if (servername && rdma_disconnect(ctx->cm_id)) {
+		perror("rdma_disconnect");
+		return;
+	}
+
+	/* If the get fails no event was returned, so `event` must not be read. */
+	if (rdma_get_cm_event(&event)) {
+		fprintf(stderr, "%s rdma_get_cm_event failed\n", __FUNCTION__);
+	} else {
+		if (event->event != RDMA_CM_EVENT_DISCONNECTED)
+			printf("unexpected event during disconnect %d\n", event->event);
+		rdma_ack_cm_event(event);
+	}
+	rdma_destroy_id(ctx->cm_id);
+}
+
+/* Server side of the CM handshake: listen on `port`, take one connect request
+ * (its private data is the client's pingpong_dest), build the context on the
+ * new cm_id and accept with *my_dest as private data.  NULL on failure. */
+static struct pingpong_context *pp_server_connect_cma(unsigned short port, int size, int tx_depth,
+						      struct pingpong_dest *my_dest,
+						      struct pingpong_dest *rem_dest)
+{
+	struct rdma_cm_id *listen_id;
+	struct rdma_cm_id *child_cm_id = NULL;
+	struct rdma_cm_event *event;
+	struct rdma_conn_param conn_param;
+	struct sockaddr_in sin;
+	struct pingpong_context *ctx;
+	int ret;
+
+	printf("%s starting server\n", __FUNCTION__);
+	ret = rdma_create_id(&listen_id, NULL);
+	if (ret) {
+		fprintf(stderr, "%s rdma_create_id failed %d\n", __FUNCTION__, ret);
+		return NULL;
+	}
+
+	memset(&sin, 0, sizeof sin);	/* address stays 0; sin_zero is no longer stack garbage */
+	sin.sin_family = PF_INET;
+	sin.sin_port = htons(port);
+	ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin);
+	if (ret) {
+		fprintf(stderr,"%s rdma_bind_addr failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_listen(listen_id, 0);
+	if (ret) {
+		fprintf(stderr,"%s rdma_listen failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
+		fprintf(stderr,"%s bad event waiting for connect request %d\n", __FUNCTION__,
+		       event->event);
+		goto err1;
+	}
+	if (!event->private_data || (event->private_data_len < sizeof(*rem_dest))) {
+		fprintf(stderr,"%s bad private data len %d\n", __FUNCTION__,
+		       event->private_data_len);
+		goto err1;
+	}
+	memcpy(rem_dest, event->private_data, sizeof(*rem_dest));
+	child_cm_id = (struct rdma_cm_id *)event->id;
+	ctx = pp_init_cma_ctx(child_cm_id, size, tx_depth, port);
+	if (!ctx) {
+		fprintf(stderr,"%s pp_init_cma_ctx failed\n", __FUNCTION__);
+		goto err1;
+	}
+
+	my_dest->qpn = 0;
+	my_dest->psn = 0xbb;
+	my_dest->rkey = ctx->mr->rkey;
+	my_dest->vaddr = (uintptr_t)ctx->buf + ctx->size;
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 1;
+	conn_param.initiator_depth = 1;
+	conn_param.private_data = my_dest;
+	conn_param.private_data_len = sizeof(*my_dest);
+	ret = rdma_accept(ctx->cm_id, &conn_param);
+	if (ret) {
+		fprintf(stderr,"%s rdma_accept failed %d\n", __FUNCTION__, ret);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+	ret = rdma_get_cm_event(&event);
+	if (ret) {
+		fprintf(stderr,"rdma_get_cm_event error %d\n", ret);
+		rdma_destroy_id(child_cm_id);
+		goto err2;
+	}
+	if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+		fprintf(stderr,"%s bad event waiting for established %d\n", __FUNCTION__,
+		       event->event);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+	fprintf(stderr,"%s connected!\n", __FUNCTION__);
+	return ctx;
+
+err1:
+	/* Ack first: librdmacm wants related events acked before rdma_destroy_id(). */
+	rdma_ack_cm_event(event);
+	if (child_cm_id)
+		rdma_destroy_id(child_cm_id);
+err2:
+	rdma_destroy_id(listen_id);
+	fprintf(stderr,"%s NOT connected!\n", __FUNCTION__);
+	return NULL;
+}
+
+/* Resolve dst to an IPv4 address as stored in sin_addr.s_addr; 0 on any failure. */
+static unsigned get_dst_addr(const char *dst)
+{
+	struct addrinfo *res;
+	unsigned addr = 0;
+	int ret;
+
+	ret = getaddrinfo(dst, NULL, NULL, &res);
+	if (ret) {
+		fprintf(stderr, "%s getaddrinfo failed - invalid hostname or IP address\n",
+			__FUNCTION__);
+		return 0;
+	}
+
+	/* Only the first result is examined; a non-IPv4 answer yields 0. */
+	if (res->ai_family == PF_INET)
+		addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
+
+	freeaddrinfo(res);	/* released on every path, including the non-IPv4 one */
+	return addr;
+}
+
+/* Client side of the CM handshake: resolve address and route, build the
+ * context, connect carrying *my_dest as private data, and copy the server's
+ * reply out of the ESTABLISHED event into *rem_dest.  NULL on failure. */
+static struct pingpong_context *pp_client_connect_cma(const char *servername,
+						      unsigned short port, int size, int tx_depth,
+						      struct pingpong_dest *my_dest,
+						      struct pingpong_dest *rem_dest)
+{
+	struct rdma_cm_event *event;
+	struct rdma_conn_param conn_param;
+	struct sockaddr_in sin;
+	struct rdma_cm_id *cm_id;
+	struct pingpong_context *ctx;
+	int ret;
+
+	fprintf(stderr,"%s starting client\n", __FUNCTION__);
+	memset(&sin, 0, sizeof sin);	/* sin_zero is no longer stack garbage */
+	sin.sin_addr.s_addr = get_dst_addr(servername);
+	if (!sin.sin_addr.s_addr) {
+		return NULL;
+	}
+
+	ret = rdma_create_id(&cm_id, NULL);
+	if (ret) {
+		fprintf(stderr,"%s rdma_create_id failed %d\n", __FUNCTION__, ret);
+		return NULL;
+	}
+
+	sin.sin_family = PF_INET;
+	sin.sin_port = htons(port);
+	ret = rdma_resolve_addr(cm_id, NULL, (struct sockaddr *)&sin, 2000);
+	if (ret) {
+		fprintf(stderr,"%s rdma_resolve_addr failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+		fprintf(stderr,"%s/%d unexpected CM event %d\n", __FUNCTION__, __LINE__,
+		       event->event);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+
+	ret = rdma_resolve_route(cm_id, 2000);
+	if (ret) {
+		fprintf(stderr,"%s rdma_resolve_route failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
+		fprintf(stderr,"%s/%d unexpected CM event %d\n", __FUNCTION__, __LINE__,
+		       event->event);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+
+	ctx = pp_init_cma_ctx(cm_id, size, tx_depth, port);
+	if (!ctx) {
+		fprintf(stderr,"%s pp_init_cma_ctx failed\n", __FUNCTION__);
+		goto err2;
+	}
+
+	my_dest->qpn = 0;
+	my_dest->psn = 0xaa;
+	my_dest->rkey = ctx->mr->rkey;
+	my_dest->vaddr = (uintptr_t)ctx->buf + ctx->size;
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 1;
+	conn_param.initiator_depth = 1;
+	conn_param.retry_count = 5;
+	conn_param.private_data = my_dest;
+	conn_param.private_data_len = sizeof(*my_dest);
+	ret = rdma_connect(ctx->cm_id, &conn_param);
+	if (ret) {
+		fprintf(stderr,"%s rdma_connect failure %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+		fprintf(stderr,"%s/%d unexpected CM event %d\n", __FUNCTION__, __LINE__,
+		       event->event);
+		goto err1;
+	}
+	if (!event->private_data || (event->private_data_len < sizeof(*rem_dest))) {
+		fprintf(stderr,"%s bad private data ptr %p len %d\n", __FUNCTION__,
+		       event->private_data, event->private_data_len);
+		goto err1;
+	}
+
+	memcpy(rem_dest, event->private_data, sizeof(*rem_dest));
+	rdma_ack_cm_event(event);
+	fprintf(stderr,"connected!\n");
+	return ctx;
+err1:
+	rdma_ack_cm_event(event);
+err2:
+	fprintf(stderr,"NOT connected!\n");
+	rdma_destroy_id(cm_id);
+	return NULL;
+}
+
+/* Sleep-poll the recv CQ until the peer's "done" SEND arrives; report the first anomaly. */
+static void pp_wait_for_done(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->rcq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+}
+/* Post the 1-byte "done" SEND via ctx->wr (overwritten), reap its completion, wait 1 s. */
+static void pp_send_done(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int rc, ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id      = 0xcafebabe;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next       = NULL;
+	rc = ibv_post_send(ctx->qp, &ctx->wr, &bad_wr);
+	if (rc != 0) {	/* nothing was queued, so no completion would ever arrive */
+		fprintf(stderr, "%s ibv_post_send failed %d!\n", __FUNCTION__, rc);
+		return;
+	}
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->scq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xcafebabe)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+	sleep(1);
+}
+/* Sleep-poll the recv CQ for the peer's "start" SEND, then re-arm the receive. */
+static void pp_wait_for_start(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->rcq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+	if (ne > 0)	/* the posted receive was consumed: post the next one */
+		pp_post_recv(ctx);
+}
+/* Post the 1-byte "start" SEND via ctx->wr (overwritten) and reap its completion. */
+static void pp_send_start(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int rc, ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id      = 0xabbaabba;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next       = NULL;
+	rc = ibv_post_send(ctx->qp, &ctx->wr, &bad_wr);
+	if (rc != 0) {	/* nothing was queued, so no completion would ever arrive */
+		fprintf(stderr, "%s ibv_post_send failed %d!\n", __FUNCTION__, rc);
+		return;
+	}
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->scq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xabbaabba)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+}
+
 static void usage(const char *argv0)
 {
 	printf("Usage:\n");
@@ -505,6 +968,7 @@
 	printf("  -C, --report-cycles    report times in cpu cycle units (default microseconds)\n");
 	printf("  -H, --report-histogram print out all results (default print summary only)\n");
 	printf("  -U, --report-unsorted  (implies -H) print out unsorted results (default sorted)\n");
+	printf("  -c, --cma    		 Use the RDMA CMA to setup the RDMA connection\n");
 }
 
 /*
@@ -599,6 +1063,7 @@
 	struct ibv_send_wr      *wr;
 	volatile char           *poll_buf;
 	volatile char           *post_buf;
+	int			 use_cma=0;
 
 	int                      scnt, rcnt, ccnt;
 
@@ -618,14 +1083,19 @@
 			{ .name = "report-cycles",  .has_arg = 0, .val = 'C' },
 			{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
 			{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
+			{ .name = "cma",  	    .has_arg = 0, .val = 'c' },
 			{ 0 }
 		};
 
-		c = getopt_long(argc, argv, "p:d:i:s:n:t:CHU", long_options, NULL);
+		c = getopt_long(argc, argv, "cp:d:i:s:n:t:CHU", long_options, NULL);
 		if (c == -1)
 			break;
 
 		switch (c) {
+		case 'c':
+			use_cma = 1;
+			break;
+
 		case 'p':
 			port = strtol(optarg, NULL, 0);
 			if (port < 0 || port > 65535) {
@@ -697,23 +1167,62 @@
 	srand48(getpid() * time(NULL));
 	page_size = sysconf(_SC_PAGESIZE);
 
-	ib_dev = pp_find_dev(ib_devname);
-	if (!ib_dev)
-		return 7;
+	if (use_cma) {
+		int rc;
+		struct pingpong_dest my_dest;
+		
+		memset(&rem_dest, 0, sizeof(rem_dest));
 
-	ctx = pp_init_ctx(ib_dev, size, tx_depth, ib_port);
-	if (!ctx)
-		return 8;
+		if (servername)
+			ctx = pp_client_connect_cma(servername, port, size, tx_depth, &my_dest, 
+						    &rem_dest);
+		else
+			ctx = pp_server_connect_cma(port, size, tx_depth, &my_dest, &rem_dest);
+		if (!ctx) {
+			fprintf(stderr, "pp_connect_cma(%s,%d) failed!\n", servername, port);
+			return rc;
+		}
 
-	if (pp_open_port(ctx, servername, ib_port, port, &rem_dest))
-		return 9;
+		/*
+	 	 * Synch up and force the server to wait for the client to send
+		 * the first message (MPA requirement).
+		 */
+		if (servername) {
+			sleep(1);
+			pp_send_start(ctx);
+		} else {
+			pp_wait_for_start(ctx);
+		}
 
+		printf("  local address:  PSN %#06x RKey %#08x VAddr %#016Lx\n",
+				my_dest.psn, my_dest.rkey, my_dest.vaddr);
+		printf("  remote address: PSN %#06x RKey %#08x VAddr %#016Lx\n",
+				rem_dest.psn, rem_dest.rkey, rem_dest.vaddr);
+	} else {
+		ib_dev = pp_find_dev(ib_devname);
+		if (!ib_dev)
+			return 7;
+
+		ctx = pp_init_ctx(ib_dev, size, tx_depth, ib_port);
+		if (!ctx)
+			return 8;
+
+		if (pp_open_port(ctx, servername, ib_port, port, &rem_dest))
+			return 9;
+	}
+
 	wr = &ctx->wr;
 	ctx->list.addr = (uintptr_t) ctx->buf;
 	ctx->list.length = ctx->size;
 	ctx->list.lkey = ctx->mr->lkey;
 	wr->wr.rdma.remote_addr = rem_dest.vaddr;
 	wr->wr.rdma.rkey = rem_dest.rkey;
+	wr->wr_id      = PINGPONG_RDMA_WRID;
+	wr->sg_list    = &ctx->list;
+	wr->num_sge    = 1;
+	wr->opcode     = IBV_WR_RDMA_WRITE;
+	wr->send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
+	wr->next       = NULL;
 
 	scnt = 0;
 	rcnt = 0;
@@ -757,9 +1266,10 @@
 		if (ccnt < iters) {
 			struct ibv_wc wc;
 			int ne;
+
 			++ccnt;
 			do {
-				ne = ibv_poll_cq(ctx->cq, 1, &wc);
+				ne = ibv_poll_cq(ctx->scq, 1, &wc);
 			} while (ne == 0);
 
 			if (ne < 0) {
@@ -778,6 +1288,12 @@
 		}
 	}
 
+	if (use_cma) {
+		pp_send_done(ctx);
+		pp_wait_for_done(ctx);
+		pp_close_cma(ctx, servername);
+	} 
+
 	print_report(&report, iters, tstamp);
 	return 0;
 }
Index: rdma_bw.c
===================================================================
--- rdma_bw.c	(revision 7050)
+++ rdma_bw.c	(working copy)
@@ -53,6 +53,7 @@
 #include <time.h>
 
 #include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
 
 #include "get_clock.h"
 
@@ -64,13 +65,16 @@
 	struct ibv_context *context;
 	struct ibv_pd      *pd;
 	struct ibv_mr      *mr;
-	struct ibv_cq      *cq;
+	struct ibv_cq      *rcq;
+	struct ibv_cq      *scq;
+	struct ibv_comp_channel *ch;
 	struct ibv_qp      *qp;
 	void               *buf;
 	unsigned            size;
 	int                 tx_depth;
 	struct ibv_sge      list;
 	struct ibv_send_wr  wr;
+	struct rdma_cm_id  *cm_id;
 };
 
 struct pingpong_dest {
@@ -308,16 +312,27 @@
 		return NULL;
 	}
 
-	ctx->cq = ibv_create_cq(ctx->context, tx_depth, NULL, NULL, 0);
-	if (!ctx->cq) {
+	ctx->ch = ibv_create_comp_channel(ctx->context);
+	if (!ctx->ch) {
+		fprintf(stderr, "Couldn't create comp channel\n");
+		return NULL;
+	}
+	ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
+	if (!ctx->rcq) {
 		fprintf(stderr, "Couldn't create CQ\n");
 		return NULL;
 	}
 
+	ctx->scq = ibv_create_cq(ctx->context, tx_depth, ctx, ctx->ch, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "Couldn't create CQ\n");
+		return NULL;
+	}
+
 	{
 		struct ibv_qp_init_attr attr = {
-			.send_cq = ctx->cq,
-			.recv_cq = ctx->cq,
+			.send_cq = ctx->scq,
+			.recv_cq = ctx->rcq,
 			.cap     = {
 				.max_send_wr  = tx_depth,
 				/* Work around:  driver doesnt support
@@ -407,6 +422,469 @@
 	return 0;
 }
 
+/* CMA STUFF */
+
+/* Post the single 1-byte receive that absorbs the peer's next sync SEND. */
+static void pp_post_recv(struct pingpong_context *ctx)
+{
+	struct ibv_recv_wr *bad_wr;
+	struct ibv_sge sge = {
+		.addr   = (uintptr_t) ctx->buf,
+		.length = 1,
+		.lkey   = ctx->mr->lkey
+	};
+	struct ibv_recv_wr recv_wr = {
+		.wr_id   = 0xdeadbeef,	/* tag verified by the pp_wait_for_*() helpers */
+		.sg_list = &sge,
+		.num_sge = 1
+	};
+	int err = ibv_post_recv(ctx->qp, &recv_wr, &bad_wr);
+	if (err) {
+		perror("ibv_post_recv");
+		fprintf(stderr, "%s ibv_post_recv failed %d\n", __FUNCTION__, err);
+	}
+}
+/* Set up buffer, PD, MR, comp channel, both CQs and an RC QP on a bound cm_id; NULL on failure. */
+static struct pingpong_context *pp_init_cma_ctx(struct rdma_cm_id *cm_id,
+					    unsigned size,
+					    int tx_depth, int port)
+{
+	struct pingpong_context *ctx;
+
+	ctx = malloc(sizeof *ctx);
+	if (!ctx)
+		return NULL;
+	/* NOTE(review): none of the failure returns below release ctx or earlier resources. */
+	ctx->size     = size;
+	ctx->tx_depth = tx_depth;
+
+	ctx->buf = memalign(page_size, size * 2);
+	if (!ctx->buf) {
+		fprintf(stderr, "Couldn't allocate work buf.\n");
+		return NULL;
+	}
+
+	memset(ctx->buf, 0, size * 2);
+
+	ctx->cm_id = cm_id;
+	ctx->context = cm_id->verbs;
+	if (!ctx->context) {
+		fprintf(stderr, "%s Unbound cm_id!!\n", __FUNCTION__);
+		return NULL;
+	}
+
+	ctx->pd = ibv_alloc_pd(ctx->context);
+	if (!ctx->pd) {
+		fprintf(stderr, "Couldn't allocate PD\n");
+		return NULL;
+	}
+
+        /* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
+         * The Consumer is not allowed to assign Remote Write or Remote Atomic to
+         * a Memory Region that has not been assigned Local Write. */
+	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2,
+			     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+	if (!ctx->mr) {
+		fprintf(stderr, "Couldn't allocate MR\n");
+		return NULL;
+	}
+
+	ctx->ch = ibv_create_comp_channel(ctx->context);
+	if (!ctx->ch) {
+		fprintf(stderr, "Couldn't create comp channel\n");
+		return NULL;
+	}
+	/* The receive CQ is created without a channel; only the send CQ gets ctx->ch. */
+	ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
+	if (!ctx->rcq) {
+		fprintf(stderr, "Couldn't create RCQ\n");
+		return NULL;
+	}
+	/* ctx itself is passed as the send CQ's cq_context argument. */
+	ctx->scq = ibv_create_cq(ctx->context, tx_depth, ctx, ctx->ch, 0);
+	if (!ctx->scq) {
+		fprintf(stderr, "Couldn't create SCQ\n");
+		return NULL;
+	}
+
+	{
+		struct ibv_qp_init_attr attr = {
+			.send_cq = ctx->scq,
+			.recv_cq = ctx->rcq,
+			.cap     = {
+				.max_send_wr  = tx_depth,
+				.max_recv_wr  = 1,
+				.max_send_sge = 1,
+				.max_recv_sge = 1,
+				.max_inline_data = 0
+			},
+			.qp_type = IBV_QPT_RC
+		};
+		if (rdma_create_qp(ctx->cm_id, ctx->pd, &attr)) {
+			fprintf(stderr, "Couldn't create QP\n");
+			return NULL;
+		}
+		ctx->qp = ctx->cm_id->qp;
+	}
+	/* Pre-post the receive that will take the peer's first sync SEND. */
+	pp_post_recv(ctx);
+
+	return ctx;
+}
+
+/* Tear the connection down: the client initiates, both ends reap the CM event. */
+static void pp_close_cma(struct pingpong_context *ctx, const char *servername)
+{
+	struct rdma_cm_event *event;
+
+	if (servername && rdma_disconnect(ctx->cm_id)) {
+		perror("rdma_disconnect");
+		return;
+	}
+
+	/* If the get fails no event was returned, so `event` must not be read. */
+	if (rdma_get_cm_event(&event)) {
+		fprintf(stderr, "%s rdma_get_cm_event failed\n", __FUNCTION__);
+	} else {
+		if (event->event != RDMA_CM_EVENT_DISCONNECTED)
+			printf("unexpected event during disconnect %d\n", event->event);
+		rdma_ack_cm_event(event);
+	}
+	rdma_destroy_id(ctx->cm_id);
+}
+
+/* Server side of the CM handshake: listen on `port`, take one connect request
+ * (its private data is the client's pingpong_dest), build the context on the
+ * new cm_id and accept with *my_dest as private data.  NULL on failure. */
+static struct pingpong_context *pp_server_connect_cma(unsigned short port, int size, int tx_depth,
+						      struct pingpong_dest *my_dest,
+						      struct pingpong_dest *rem_dest)
+{
+	struct rdma_cm_id *listen_id;
+	struct rdma_cm_id *child_cm_id = NULL;
+	struct rdma_cm_event *event;
+	struct rdma_conn_param conn_param;
+	struct sockaddr_in sin;
+	struct pingpong_context *ctx;
+	int ret;
+
+	printf("%s starting server\n", __FUNCTION__);
+	ret = rdma_create_id(&listen_id, NULL);
+	if (ret) {
+		fprintf(stderr, "%s rdma_create_id failed %d\n", __FUNCTION__, ret);
+		return NULL;
+	}
+
+	memset(&sin, 0, sizeof sin);	/* address stays 0; sin_zero is no longer stack garbage */
+	sin.sin_family = PF_INET;
+	sin.sin_port = htons(port);
+	ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin);
+	if (ret) {
+		fprintf(stderr,"%s rdma_bind_addr failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_listen(listen_id, 0);
+	if (ret) {
+		fprintf(stderr,"%s rdma_listen failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
+		fprintf(stderr,"%s bad event waiting for connect request %d\n", __FUNCTION__,
+		       event->event);
+		goto err1;
+	}
+	if (!event->private_data || (event->private_data_len < sizeof(*rem_dest))) {
+		fprintf(stderr,"%s bad private data len %d\n", __FUNCTION__,
+		       event->private_data_len);
+		goto err1;
+	}
+	memcpy(rem_dest, event->private_data, sizeof(*rem_dest));
+	child_cm_id = (struct rdma_cm_id *)event->id;
+	ctx = pp_init_cma_ctx(child_cm_id, size, tx_depth, port);
+	if (!ctx) {
+		fprintf(stderr,"%s pp_init_cma_ctx failed\n", __FUNCTION__);
+		goto err1;
+	}
+
+	my_dest->qpn = 0;
+	my_dest->psn = 0xbb;
+	my_dest->rkey = ctx->mr->rkey;
+	my_dest->vaddr = (uintptr_t)ctx->buf + ctx->size;
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 1;
+	conn_param.initiator_depth = 1;
+	conn_param.private_data = my_dest;
+	conn_param.private_data_len = sizeof(*my_dest);
+	ret = rdma_accept(ctx->cm_id, &conn_param);
+	if (ret) {
+		fprintf(stderr,"%s rdma_accept failed %d\n", __FUNCTION__, ret);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+	ret = rdma_get_cm_event(&event);
+	if (ret) {
+		fprintf(stderr,"rdma_get_cm_event error %d\n", ret);
+		rdma_destroy_id(child_cm_id);
+		goto err2;
+	}
+	if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+		fprintf(stderr,"%s bad event waiting for established %d\n", __FUNCTION__,
+		       event->event);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+	fprintf(stderr,"%s connected!\n", __FUNCTION__);
+	return ctx;
+
+err1:
+	/* Ack first: librdmacm wants related events acked before rdma_destroy_id(). */
+	rdma_ack_cm_event(event);
+	if (child_cm_id)
+		rdma_destroy_id(child_cm_id);
+err2:
+	rdma_destroy_id(listen_id);
+	fprintf(stderr,"%s NOT connected!\n", __FUNCTION__);
+	return NULL;
+}
+
+/* Resolve dst to an IPv4 address as stored in sin_addr.s_addr; 0 on any failure. */
+static unsigned get_dst_addr(const char *dst)
+{
+	struct addrinfo *res;
+	unsigned addr = 0;
+	int ret;
+
+	ret = getaddrinfo(dst, NULL, NULL, &res);
+	if (ret) {
+		fprintf(stderr, "%s getaddrinfo failed - invalid hostname or IP address\n",
+			__FUNCTION__);
+		return 0;
+	}
+
+	/* Only the first result is examined; a non-IPv4 answer yields 0. */
+	if (res->ai_family == PF_INET)
+		addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
+
+	freeaddrinfo(res);	/* released on every path, including the non-IPv4 one */
+	return addr;
+}
+
+/* Client side of the CM handshake: resolve address and route, build the
+ * context, connect carrying *my_dest as private data, and copy the server's
+ * reply out of the ESTABLISHED event into *rem_dest.  NULL on failure. */
+static struct pingpong_context *pp_client_connect_cma(const char *servername,
+						      unsigned short port, int size, int tx_depth,
+						      struct pingpong_dest *my_dest,
+						      struct pingpong_dest *rem_dest)
+{
+	struct rdma_cm_event *event;
+	struct rdma_conn_param conn_param;
+	struct sockaddr_in sin;
+	struct rdma_cm_id *cm_id;
+	struct pingpong_context *ctx;
+	int ret;
+
+	fprintf(stderr,"%s starting client\n", __FUNCTION__);
+	memset(&sin, 0, sizeof sin);	/* sin_zero is no longer stack garbage */
+	sin.sin_addr.s_addr = get_dst_addr(servername);
+	if (!sin.sin_addr.s_addr) {
+		return NULL;
+	}
+
+	ret = rdma_create_id(&cm_id, NULL);
+	if (ret) {
+		fprintf(stderr,"%s rdma_create_id failed %d\n", __FUNCTION__, ret);
+		return NULL;
+	}
+
+	sin.sin_family = PF_INET;
+	sin.sin_port = htons(port);
+	ret = rdma_resolve_addr(cm_id, NULL, (struct sockaddr *)&sin, 2000);
+	if (ret) {
+		fprintf(stderr,"%s rdma_resolve_addr failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+		fprintf(stderr,"%s/%d unexpected CM event %d\n", __FUNCTION__, __LINE__,
+		       event->event);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+
+	ret = rdma_resolve_route(cm_id, 2000);
+	if (ret) {
+		fprintf(stderr,"%s rdma_resolve_route failed %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
+		fprintf(stderr,"%s/%d unexpected CM event %d\n", __FUNCTION__, __LINE__,
+		       event->event);
+		goto err1;
+	}
+	rdma_ack_cm_event(event);
+
+	ctx = pp_init_cma_ctx(cm_id, size, tx_depth, port);
+	if (!ctx) {
+		fprintf(stderr,"%s pp_init_cma_ctx failed\n", __FUNCTION__);
+		goto err2;
+	}
+
+	my_dest->qpn = 0;
+	my_dest->psn = 0xaa;
+	my_dest->rkey = ctx->mr->rkey;
+	my_dest->vaddr = (uintptr_t)ctx->buf + ctx->size;
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 1;
+	conn_param.initiator_depth = 1;
+	conn_param.retry_count = 5;
+	conn_param.private_data = my_dest;
+	conn_param.private_data_len = sizeof(*my_dest);
+	ret = rdma_connect(ctx->cm_id, &conn_param);
+	if (ret) {
+		fprintf(stderr,"%s rdma_connect failure %d\n", __FUNCTION__, ret);
+		goto err2;
+	}
+
+	ret = rdma_get_cm_event(&event);
+	if (ret)
+		goto err2;
+	if (event->event != RDMA_CM_EVENT_ESTABLISHED) {
+		fprintf(stderr,"%s/%d unexpected CM event %d\n", __FUNCTION__, __LINE__,
+		       event->event);
+		goto err1;
+	}
+	if (!event->private_data || (event->private_data_len < sizeof(*rem_dest))) {
+		fprintf(stderr,"%s bad private data ptr %p len %d\n", __FUNCTION__,
+		       event->private_data, event->private_data_len);
+		goto err1;
+	}
+	memcpy(rem_dest, event->private_data, sizeof(*rem_dest));
+	rdma_ack_cm_event(event);
+	fprintf(stderr,"connected!\n");
+	return ctx;
+err1:
+	rdma_ack_cm_event(event);
+err2:
+	fprintf(stderr,"NOT connected!\n");
+	rdma_destroy_id(cm_id);
+	return NULL;
+}
+
+/* Sleep-poll the recv CQ until the peer's "done" SEND arrives; report the first anomaly. */
+static void pp_wait_for_done(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->rcq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+}
+/* Post the 1-byte "done" SEND via ctx->wr (overwritten), reap its completion, wait 1 s. */
+static void pp_send_done(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int rc, ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id      = 0xcafebabe;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next       = NULL;
+	rc = ibv_post_send(ctx->qp, &ctx->wr, &bad_wr);
+	if (rc != 0) {	/* nothing was queued, so no completion would ever arrive */
+		fprintf(stderr, "%s ibv_post_send failed %d!\n", __FUNCTION__, rc);
+		return;
+	}
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->scq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xcafebabe)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+	sleep(1);
+}
+/* Sleep-poll the recv CQ for the peer's "start" SEND, then re-arm the receive. */
+static void pp_wait_for_start(struct pingpong_context *ctx)
+{
+	struct ibv_wc wc;
+	int ne;
+
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->rcq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (!(wc.opcode & IBV_WC_RECV))
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xdeadbeef)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+	if (ne > 0)	/* the posted receive was consumed: post the next one */
+		pp_post_recv(ctx);
+}
+/* Post the 1-byte "start" SEND via ctx->wr (overwritten) and reap its completion. */
+static void pp_send_start(struct pingpong_context *ctx)
+{
+	struct ibv_send_wr *bad_wr;
+	struct ibv_wc wc;
+	int rc, ne;
+
+	ctx->list.addr = (uintptr_t) ctx->buf;
+	ctx->list.length = 1;
+	ctx->list.lkey = ctx->mr->lkey;
+	ctx->wr.wr_id      = 0xabbaabba;
+	ctx->wr.sg_list    = &ctx->list;
+	ctx->wr.num_sge    = 1;
+	ctx->wr.opcode     = IBV_WR_SEND;
+	ctx->wr.send_flags = IBV_SEND_SIGNALED;
+	ctx->wr.next       = NULL;
+	rc = ibv_post_send(ctx->qp, &ctx->wr, &bad_wr);
+	if (rc != 0) {	/* nothing was queued, so no completion would ever arrive */
+		fprintf(stderr, "%s ibv_post_send failed %d!\n", __FUNCTION__, rc);
+		return;
+	}
+	for (ne = 0; ne == 0; ne = ibv_poll_cq(ctx->scq, 1, &wc))
+		usleep(1000);
+	if (ne < 0)	/* poll error: wc was never written, so it is not inspected */
+		fprintf(stderr, "%s ibv_poll_cq failed %d\n", __FUNCTION__, ne);
+	else if (wc.status)
+		fprintf(stderr, "%s bad wc status %d\n", __FUNCTION__, wc.status);
+	else if (wc.opcode != IBV_WC_SEND)
+		fprintf(stderr, "%s bad wc opcode %d\n", __FUNCTION__, wc.opcode);
+	else if (wc.wr_id != 0xabbaabba)
+		fprintf(stderr, "%s bad wc wr_id 0x%x\n", __FUNCTION__, (int)wc.wr_id);
+	printf("start sent\n");
+}
+
 static void usage(const char *argv0)
 {
 	printf("Usage:\n");
@@ -421,6 +899,7 @@
 	printf("  -t, --tx-depth=<dep>   size of tx queue (default 100)\n");
 	printf("  -n, --iters=<iters>    number of exchanges (at least 2, default 1000)\n");
 	printf("  -b, --bidirectional    measure bidirectional bandwidth (default unidirectional)\n");
+	printf("  -c, --cma    		 Use the RDMA CMA to setup the RDMA connection\n");
 }
 
 static void print_report(unsigned int iters, unsigned size, int duplex,
@@ -438,14 +917,15 @@
 
 	/* Find the peak bandwidth */
 	for (i = 0; i < iters; ++i)
-		for (j = i; j < iters; ++j) {
-			t = (tcompleted[j] - tposted[i]) / (j - i + 1);
-			if (t < opt_delta) {
-				opt_delta  = t;
-				opt_posted = i;
-				opt_completed = j;
+		if ((i+200) < iters)
+			for (j = i; j < i+200; ++j) {
+				t = (tcompleted[j] - tposted[i]) / (j - i + 1);
+				if (t < opt_delta) {
+					opt_delta  = t;
+					opt_posted = i;
+					opt_completed = j;
+				}
 			}
-		}
 
 	cycles_to_units = get_cpu_mhz() * 1000000;
 
@@ -486,10 +966,16 @@
 	int			 sockfd;
 	int                      duplex = 0;
 	struct ibv_qp		*qp;
+	int			 use_cma=0;
+	int			 poll = 0;
+	cycles_t		*tposted;
+	cycles_t		*tcompleted;
+	struct ibv_wc 		 wc;
+	int 			 ne;
+	struct ibv_cq 		*ev_cq;
+	void 			*ev_ctx;
+	int			 blocks = 0;
 
-	cycles_t	*tposted;
-	cycles_t	*tcompleted;
-
 	/* Parameter parsing. */
 	while (1) {
 		int c;
@@ -502,14 +988,24 @@
 			{ .name = "iters",          .has_arg = 1, .val = 'n' },
 			{ .name = "tx-depth",       .has_arg = 1, .val = 't' },
 			{ .name = "bidirectional",  .has_arg = 0, .val = 'b' },
+			{ .name = "cma",  	    .has_arg = 0, .val = 'c' },
+			{ .name = "poll", 	    .has_arg = 0, .val = 'P' },
 			{ 0 }
 		};
 
-		c = getopt_long(argc, argv, "p:d:i:s:n:t:b", long_options, NULL);
+		c = getopt_long(argc, argv, "p:d:i:s:n:t:bcP", long_options, NULL);
 		if (c == -1)
 			break;
 
 		switch (c) {
+		case 'c':
+			use_cma = 1;
+			break;
+
+		case 'P':
+			poll = 1;
+			break;
+
 		case 'p':
 			port = strtol(optarg, NULL, 0);
 			if (port < 0 || port > 65535) {
@@ -552,15 +1048,6 @@
 
 			break;
 
-		case 'l':
-			tx_depth = strtol(optarg, NULL, 0);
-			if (tx_depth < 1) {
-				usage(argv[0]);
-				return 1;
-			}
-
-			break;
-
 		case 'b':
 			duplex = 1;
 			break;
@@ -585,82 +1072,125 @@
 
 	page_size = sysconf(_SC_PAGESIZE);
 
-	dev_list = ibv_get_device_list(NULL);
-
-	if (!ib_devname) {
-		ib_dev = dev_list[0];
-		if (!ib_dev) {
-			fprintf(stderr, "No IB devices found\n");
+	if (use_cma) {
+		int rc;
+		rem_dest = malloc(sizeof *rem_dest);
+		if (!rem_dest) {
+			fprintf(stderr,"%s cannot malloc rem_dest!\n", __FUNCTION__);
 			return 1;
 		}
+
+		memset(rem_dest, 0, sizeof(*rem_dest));
+
+		if (servername)
+			ctx = pp_client_connect_cma(servername, port, size, tx_depth, &my_dest, 
+						    rem_dest);
+		else
+			ctx = pp_server_connect_cma(port, size, tx_depth, &my_dest, rem_dest);
+		if (!ctx) {
+			fprintf(stderr, "pp_connect_cma(%s,%d) failed!\n", servername, port);
+			return rc;
+		}
+
+		/*
+	 	 * Synch up and force the server to wait for the client to send
+		 * the first message (MPA requirement).
+		 */
+		if (servername) {
+			sleep(1);
+			pp_send_start(ctx);
+		} else {
+			pp_wait_for_start(ctx);
+		}
+
+		printf("  local address:  PSN %#06x RKey %#08x VAddr %#016Lx\n",
+				my_dest.psn, my_dest.rkey, my_dest.vaddr);
+		printf("  remote address: PSN %#06x RKey %#08x VAddr %#016Lx\n",
+				rem_dest->psn, rem_dest->rkey, rem_dest->vaddr);
 	} else {
-		for (; (ib_dev = *dev_list); ++dev_list)
-			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
-				break;
-		if (!ib_dev) {
-			fprintf(stderr, "IB device %s not found\n", ib_devname);
-			return 1;
+		dev_list = ibv_get_device_list(NULL);
+
+		if (!ib_devname) {
+			ib_dev = dev_list[0];
+			if (!ib_dev) {
+				fprintf(stderr, "No IB devices found\n");
+				return 1;
+			}
+		} else {
+			for (; (ib_dev = *dev_list); ++dev_list)
+				if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
+					break;
+			if (!ib_dev) {
+				fprintf(stderr, "IB device %s not found\n", ib_devname);
+				return 1;
+			}
 		}
-	}
 
-	ctx = pp_init_ctx(ib_dev, size, tx_depth, ib_port);
-	if (!ctx)
-		return 1;
+		ctx = pp_init_ctx(ib_dev, size, tx_depth, ib_port);
+		if (!ctx)
+			return 1;
 
-	/* Create connection between client and server.
-	 * We do it by exchanging data over a TCP socket connection. */
+		/* Create connection between client and server.
+		 * We do it by exchanging data over a TCP socket connection. */
 
-	my_dest.lid = pp_get_local_lid(ctx, ib_port);
-	my_dest.qpn = ctx->qp->qp_num;
-	my_dest.psn = lrand48() & 0xffffff;
-	if (!my_dest.lid) {
-		fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n");
-		return 1;
-	}
-	my_dest.rkey = ctx->mr->rkey;
-	my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
+		my_dest.lid = pp_get_local_lid(ctx, ib_port);
+		my_dest.qpn = ctx->qp->qp_num;
+		my_dest.psn = lrand48() & 0xffffff;
+		if (!my_dest.lid) {
+			fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n");
+			return 1;
+		}
+		my_dest.rkey = ctx->mr->rkey;
+		my_dest.vaddr = (uintptr_t)ctx->buf + ctx->size;
 
-	printf("  local address:  LID %#04x, QPN %#06x, PSN %#06x "
-			"RKey %#08x VAddr %#016Lx\n",
-			my_dest.lid, my_dest.qpn, my_dest.psn,
-			my_dest.rkey, my_dest.vaddr);
+		printf("  local address:  LID %#04x, QPN %#06x, PSN %#06x "
+				"RKey %#08x VAddr %#016Lx\n",
+				my_dest.lid, my_dest.qpn, my_dest.psn,
+				my_dest.rkey, my_dest.vaddr);
 
-	if (servername) {
-		sockfd = pp_client_connect(servername, port);
-		if (sockfd < 0)
+		if (servername) {
+			sockfd = pp_client_connect(servername, port);
+			if (sockfd < 0)
+				return 1;
+			rem_dest = pp_client_exch_dest(sockfd, &my_dest);
+		} else {
+			sockfd = pp_server_connect(port);
+			if (sockfd < 0)
+				return 1;
+			rem_dest = pp_server_exch_dest(sockfd, &my_dest);
+		}
+
+		if (!rem_dest)
 			return 1;
-		rem_dest = pp_client_exch_dest(sockfd, &my_dest);
-	} else {
-		sockfd = pp_server_connect(port);
-		if (sockfd < 0)
-			return 1;
-		rem_dest = pp_server_exch_dest(sockfd, &my_dest);
-	}
 
-	if (!rem_dest)
-		return 1;
+		printf("  remote address: LID %#04x, QPN %#06x, PSN %#06x, "
+				"RKey %#08x VAddr %#016Lx\n",
+				rem_dest->lid, rem_dest->qpn, rem_dest->psn,
+				rem_dest->rkey, rem_dest->vaddr);
 
-	printf("  remote address: LID %#04x, QPN %#06x, PSN %#06x, "
-			"RKey %#08x VAddr %#016Lx\n",
-			rem_dest->lid, rem_dest->qpn, rem_dest->psn,
-			rem_dest->rkey, rem_dest->vaddr);
+		if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest))
+			return 1;
 
-	if (pp_connect_ctx(ctx, ib_port, my_dest.psn, rem_dest))
-		return 1;
-
-	/* An additional handshake is required *after* moving qp to RTR.
-           Arbitrarily reuse exch_dest for this purpose. */
-	if (servername) {
-		rem_dest = pp_client_exch_dest(sockfd, &my_dest);
-	} else {
-		rem_dest = pp_server_exch_dest(sockfd, &my_dest);
+		/* An additional handshake is required *after* moving qp to RTR.
+		   Arbitrarily reuse exch_dest for this purpose. */
+		if (servername) {
+			rem_dest = pp_client_exch_dest(sockfd, &my_dest);
+		} else {
+			rem_dest = pp_server_exch_dest(sockfd, &my_dest);
+		}
 	}
 
 	/* For half duplex tests, server just waits for client to exit */
 	if (!servername && !duplex) {
-		rem_dest = pp_server_exch_dest(sockfd, &my_dest);
-		write(sockfd, "done", sizeof "done");
-		close(sockfd);
+		if (use_cma) {
+			pp_wait_for_done(ctx);
+			pp_send_done(ctx);
+			pp_close_cma(ctx, servername);
+		} else {
+			rem_dest = pp_server_exch_dest(sockfd, &my_dest);
+			write(sockfd, "done", sizeof "done");
+			close(sockfd);
+		}
 		return 0;
 	}
 
@@ -695,6 +1225,11 @@
 		return 1;
 	}
 
+	if (!poll && ibv_req_notify_cq(ctx->scq, 0)) {
+		fprintf(stderr, "ibv_req_notify failed!\n");
+		return 1;
+	}
+
 	/* Done with setup. Start the test. */
 
 	while (scnt < iters || ccnt < iters) {
@@ -712,10 +1247,22 @@
 		}
 
 		if (ccnt < iters) {
-			struct ibv_wc wc;
-			int ne;
+
 			do {
-				ne = ibv_poll_cq(ctx->cq, 1, &wc);
+				ne = ibv_poll_cq(ctx->scq, 1, &wc);
+				if (!poll && ne == 0) {
+					blocks++;
+					if (ibv_get_cq_event(ctx->ch, &ev_cq, &ev_ctx)) {
+						fprintf(stderr, "Failed to get cq event!\n");
+						return 1;
+					}
+					if (ev_cq != ctx->scq) {
+						fprintf(stderr, "Unkown CQ!\n");
+						return 1;
+					}
+					ibv_ack_cq_events(ctx->scq, 1);
+					ibv_req_notify_cq(ctx->scq, 0);
+				}
 			} while (ne == 0);
 
 			tcompleted[ccnt] = get_cycles();
@@ -737,15 +1284,20 @@
 		}
 	}
 
-	if (servername) {
-		rem_dest = pp_client_exch_dest(sockfd, &my_dest);
+	if (use_cma) {
+		pp_send_done(ctx);
+		pp_wait_for_done(ctx);
+		pp_close_cma(ctx, servername);
 	} else {
-		rem_dest = pp_server_exch_dest(sockfd, &my_dest);
+		if (servername) {
+			rem_dest = pp_client_exch_dest(sockfd, &my_dest);
+		} else {
+			rem_dest = pp_server_exch_dest(sockfd, &my_dest);
+		}
+		write(sockfd, "done", sizeof "done");
+		close(sockfd);
 	}
 
-	write(sockfd, "done", sizeof "done");
-	close(sockfd);
-
 	print_report(iters, size, duplex, tposted, tcompleted);
 
 	free(tposted);
Index: Makefile
===================================================================
--- Makefile	(revision 7050)
+++ Makefile	(working copy)
@@ -10,7 +10,7 @@
 LOADLIBES += 
 LDFLAGS +=
 
-${TESTS}: LOADLIBES += -libverbs
+${TESTS}: LOADLIBES += -libverbs -lrdmacm
 
 ${TESTS} ${UTILS}: %: %.c ${EXTRA_FILES} ${EXTRA_HEADERS}
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $< ${EXTRA_FILES} $(LOADLIBES) $(LDLIBS) -o $@




More information about the general mailing list