[ofa-general] [PATCH v5] IB/mlx4: shrinking WQE
Tang, Changqing
changquing.tang at hp.com
Wed Oct 10 09:09:48 PDT 2007
Can you provide sample code to use these new features?
--CQ
> -----Original Message-----
> From: general-bounces at lists.openfabrics.org
> [mailto:general-bounces at lists.openfabrics.org] On Behalf Of
> Jack Morgenstein
> Sent: Wednesday, October 10, 2007 10:44 AM
> To: general at lists.openfabrics.org
> Cc: Roland Dreier
> Subject: [ofa-general] [PATCH v5] IB/mlx4: shrinking WQE
>
> commit c0aa89f0b295dd0c20b2ff2b1d2eca10cdc84f4b
> Author: Michael S. Tsirkin <mst at dev.mellanox.co.il>
> Date: Thu Aug 30 15:51:40 2007 +0300
>
> IB/mlx4: shrinking WQE
>
> ConnectX supports shrinking WQEs, so that a single WR can include
> multiple units of wqe_shift.  This way, WRs can differ in size and do
> not have to be a power of 2 in size, saving memory and speeding up
> send WR posting.  Unfortunately, if we do this then the wqe_index
> field in the CQE can't be used to look up the WR ID anymore, so we do
> it only if selective signalling is off.
>
> Further, on 32-bit platforms, we can't use vmap to make the QP buffer
> virtually contiguous.  Thus we have to use constant-sized WRs to make
> sure a WR is always fully within a single page-sized chunk.
>
> Finally, we use WRs with the NOP opcode to avoid wrap-around in the
> middle of a WR.  We set the NoErrorCompletion bit to avoid getting
> completions with error for NOP WRs.  Since NEC is only supported
> starting with firmware 2.2.232, we use constant-sized WRs for older
> firmware.  And, since MLX QPs only support SEND, we use
> constant-sized WRs in this case as well.
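
The shrinking is transparent to consumers: the driver enables it only for
kernel QPs created with selective signalling (sq_sig_type ==
IB_SIGNAL_REQ_WR), on 64-bit platforms, with firmware 2.2.232 or newer.
Below is a rough sketch of a kernel ULP using that setup -- illustrative
only, not part of the patch; the pd, cq, and DMA parameters are assumed
to come from the caller.

#include <rdma/ib_verbs.h>

/*
 * Create a kernel QP with selective signalling so that the mlx4
 * driver above is allowed to post variable-size (shrunk) WQEs.
 */
static struct ib_qp *create_shrink_capable_qp(struct ib_pd *pd,
					      struct ib_cq *cq)
{
	struct ib_qp_init_attr attr = {
		.send_cq     = cq,
		.recv_cq     = cq,
		.cap	     = {
			.max_send_wr  = 256,
			.max_recv_wr  = 256,
			.max_send_sge = 4,
			.max_recv_sge = 1,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,	/* selective signalling */
		.qp_type     = IB_QPT_RC,
	};

	return ib_create_qp(pd, &attr);
}

/*
 * Post one send, signalling only every 16th WR.  Unsignalled WRs
 * never generate CQEs, so the driver recovers wr_id from its own
 * wrid[] ring rather than from the CQE wqe_index field.
 */
static int post_one_send(struct ib_qp *qp, u64 dma_addr, u32 len,
			 u32 lkey, u64 wr_id)
{
	struct ib_sge sge = {
		.addr	= dma_addr,
		.length = len,
		.lkey	= lkey,
	};
	struct ib_send_wr wr = {
		.wr_id	    = wr_id,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode	    = IB_WR_SEND,
		.send_flags = (wr_id % 16 == 15) ? IB_SEND_SIGNALED : 0,
	};
	struct ib_send_wr *bad_wr;

	return ib_post_send(qp, &wr, &bad_wr);
}
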
>
> Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
>
> ---
>
> Changes since v4: fix calls to stamp_send_wqe, and stamping placement
> inside post_nop_wqe.
> Found in regression testing; fixed by Jack Morgenstein.
> Changes since v3: fix NOP formatting.
> Found by Eli Cohen.
> Changes since v2: fix memory leak in mlx4_buf_alloc.
> Found by internal code review.
> Changes since v1: add missing patch hunks.
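
At its core, the patch makes a send WR consume DIV_ROUND_UP(wr_size,
1 << wqe_shift) consecutive basic WQE units instead of one fixed-size
slot, and pads the queue tail with a NOP WR whenever fewer than
sq_max_wqes_per_wr units remain before the wrap point, so that no WR
straddles the wrap.  A small stand-alone model of that index arithmetic
follows -- simplified and illustrative; the names mirror the patch but
this is not driver code.

#include <stdio.h>

#define WQE_SHIFT	6		/* 64-byte basic WQE unit */
#define WQE_CNT		64		/* power of 2, as in the driver */
#define MAX_WQES_PER_WR	4		/* units needed by the largest WR */

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * Advance the free-running send index by one WR of 'size' bytes.
 * The driver runs pad_wraparound() after the previous WR; checking
 * before posting, as done here, preserves the same invariant: no WR
 * ever straddles the wrap point of the work queue.
 */
static unsigned post_wr(unsigned ind, unsigned size)
{
	unsigned left = WQE_CNT - (ind & (WQE_CNT - 1));

	if (left < MAX_WQES_PER_WR) {
		printf("NOP pad: %2u units at slot %2u\n",
		       left, ind & (WQE_CNT - 1));
		ind += left;
	}

	printf("WR of %3u bytes (%u units) at slot %2u\n",
	       size, DIV_ROUND_UP(size, 1 << WQE_SHIFT),
	       ind & (WQE_CNT - 1));
	return ind + DIV_ROUND_UP(size, 1 << WQE_SHIFT);
}

int main(void)
{
	unsigned ind = 0;	/* plays the role of qp->sq_next_wqe */
	int i;

	for (i = 0; i < 40; ++i)
		ind = post_wr(ind, 64 * (1 + i % MAX_WQES_PER_WR));

	return 0;
}
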
>
> Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
> ===================================================================
> --- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c	2007-10-10 17:12:05.184757000 +0200
> +++ infiniband/drivers/infiniband/hw/mlx4/cq.c	2007-10-10 17:23:02.337140000 +0200
> @@ -331,6 +331,12 @@ static int mlx4_ib_poll_one(struct mlx4_
> 	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
> 		MLX4_CQE_OPCODE_ERROR;
> 
> +	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
> +		     is_send)) {
> +		printk(KERN_WARNING "Completion for NOP opcode detected!\n");
> +		return -EINVAL;
> +	}
> +
> 	if (!*cur_qp ||
> 	    (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
> /*
> @@ -353,8 +359,10 @@ static int mlx4_ib_poll_one(struct mlx4_
>
> if (is_send) {
> wq = &(*cur_qp)->sq;
> - wqe_ctr = be16_to_cpu(cqe->wqe_index);
> - wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
> + if (!(*cur_qp)->sq_signal_bits) {
> + wqe_ctr = be16_to_cpu(cqe->wqe_index);
> + wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
> + }
> wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
> ++wq->tail;
> } else if ((*cur_qp)->ibqp.srq) {
> Index: infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h
> ===================================================================
> --- infiniband.orig/drivers/infiniband/hw/mlx4/mlx4_ib.h	2007-10-10 17:21:17.844882000 +0200
> +++ infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h	2007-10-10 17:23:02.341138000 +0200
> @@ -120,6 +120,8 @@ struct mlx4_ib_qp {
>
> u32 doorbell_qpn;
> __be32 sq_signal_bits;
> + unsigned sq_next_wqe;
> + int sq_max_wqes_per_wr;
> int sq_spare_wqes;
> struct mlx4_ib_wq sq;
>
> Index: infiniband/drivers/infiniband/hw/mlx4/qp.c
> ===================================================================
> --- infiniband.orig/drivers/infiniband/hw/mlx4/qp.c	2007-10-10 17:21:17.853882000 +0200
> +++ infiniband/drivers/infiniband/hw/mlx4/qp.c	2007-10-10 17:23:02.350137000 +0200
> @@ -30,6 +30,7 @@
> * SOFTWARE.
> */
>
> +#include <linux/log2.h>
> #include <rdma/ib_cache.h>
> #include <rdma/ib_pack.h>
>
> @@ -92,7 +93,7 @@ static int is_qp0(struct mlx4_ib_dev *de
>
> static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
> {
> -	if (qp->buf.nbufs == 1)
> +	if (BITS_PER_LONG == 64 || qp->buf.nbufs == 1)
> 		return qp->buf.u.direct.buf + offset;
> 	else
> 		return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
> 			(offset & (PAGE_SIZE - 1));
> @@ -111,16 +112,88 @@ static void *get_send_wqe(struct mlx4_ib
>
> /*
> * Stamp a SQ WQE so that it is invalid if prefetched by marking the
> - * first four bytes of every 64 byte chunk with 0xffffffff, except for
> - * the very first chunk of the WQE.
> + * first four bytes of every 64 byte chunk with
> + * 0x7FFFFFF | (invalid_ownership_value << 31).
> + *
> + * When the max WR size is less than or equal to the WQE size,
> + * as an optimization, we can stamp the WQE with 0xffffffff,
> + * and skip the very first chunk of the WQE.
> */
> -static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
> +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
> {
> - u32 *wqe = get_send_wqe(qp, n);
> + u32 *wqe;
> int i;
> + int s;
> + int ind;
> + void *buf;
> + __be32 stamp;
> +
> + s = roundup(size, 1 << qp->sq.wqe_shift);
> +	if (qp->sq_max_wqes_per_wr > 1) {
> +		for (i = 0; i < s; i += 64) {
> +			ind = (i >> qp->sq.wqe_shift) + n;
> +			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
> +						       cpu_to_be32(0xffffffff);
> +			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
> +			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
> +			*wqe = stamp;
> +		}
> +	} else {
> +		buf = get_send_wqe(qp, n);
> +		for (i = 64; i < s; i += 64) {
> +			wqe = buf + i;
> +			*wqe = 0xffffffff;
> +		}
> +	}
> +}
> +
> +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
> +{
> + struct mlx4_wqe_ctrl_seg *ctrl;
> + struct mlx4_wqe_inline_seg *inl;
> + void *wqe;
> + int s;
> +
> + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
> + s = sizeof(struct mlx4_wqe_ctrl_seg);
> +
> +	if (qp->ibqp.qp_type == IB_QPT_UD) {
> +		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
> +		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
> +		memset(dgram, 0, sizeof *dgram);
> +		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
> +		s += sizeof(struct mlx4_wqe_datagram_seg);
> +	}
> +
> + /* Pad the remainder of the WQE with an inline data segment. */
> +	if (size > s) {
> +		inl = wqe + s;
> +		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
> +	}
> + ctrl->srcrb_flags = 0;
> + ctrl->fence_size = size / 16;
> + /*
> + * Make sure descriptor is fully written before
> + * setting ownership bit (because HW can start
> + * executing as soon as we do).
> + */
> + wmb();
>
> - for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
> - wqe[i] = 0xffffffff;
> +	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
> +		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
> +
> +	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
> +}
> +
> +/* Post NOP WQE to prevent wrap-around in the middle of WR */
> +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
> +{
> +	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
> +	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
> +		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
> +		ind += s;
> +	}
> +	return ind;
> +}
>
> static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
> @@ -237,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_de
> static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
> 			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
> {
> + int s;
> +
> /* Sanity check SQ size before proceeding */
> if (cap->max_send_wr > dev->dev->caps.max_wqes ||
> cap->max_send_sge > dev->dev->caps.max_sq_sg ||
> @@ -252,20 +327,69 @@ static int set_kernel_sq_size(struct mlx
> cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
> return -EINVAL;
>
> -	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
> -							sizeof (struct mlx4_wqe_data_seg),
> -							cap->max_inline_data +
> -							sizeof (struct mlx4_wqe_inline_seg)) +
> -						    send_wqe_overhead(type)));
> -	qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
> -		sizeof (struct mlx4_wqe_data_seg);
> +	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
> +		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
> +		send_wqe_overhead(type);
>
> /*
> - * We need to leave 2 KB + 1 WQE of headroom in the SQ to
> - * allow HW to prefetch.
> + * Hermon supports shrinking WQEs, so that a single WR can include
> + * multiple units of wqe_shift.  This way, WRs can differ in size,
> + * and do not have to be a power of 2 in size, saving memory and
> + * speeding up send WR posting.  Unfortunately, if we do this then
> + * the wqe_index field in the CQE can't be used to look up the WR
> + * ID anymore, so we do this only if selective signalling is off.
> + *
> + * Further, on 32-bit platforms, we can't use vmap to make
> + * the QP buffer virtually contiguous.  Thus we have to use
> + * constant-sized WRs to make sure a WR is always fully within
> + * a single page-sized chunk.
> + *
> + * Finally, we use the NOP opcode to avoid wrap-around in the
> + * middle of a WR.  We set the NEC bit to avoid getting
> + * completions with error for NOP WRs.  Since NEC is only
> + * supported starting with firmware 2.2.232, we use
> + * constant-sized WRs for older firmware.
> + *
> + * And, since MLX QPs only support SEND, we use constant-sized
> + * WRs in this case.
> + *
> + * We look for the smallest value of wqe_shift such that the
> + * resulting number of WQEs does not exceed device capabilities.
> + *
> + * We set WQE size to at least 64 bytes, this way stamping
> + * invalidates each WQE.
> */
> -	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
> -	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
> +	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
> +	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
> +	    type != IB_QPT_SMI && type != IB_QPT_GSI)
> +		qp->sq.wqe_shift = ilog2(64);
> +	else
> +		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
> +
> +	for (;;) {
> +		if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
> +			return -EINVAL;
> +
> +		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1 << qp->sq.wqe_shift);
> +
> +		/*
> +		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
> +		 * allow HW to prefetch.
> +		 */
> +		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
> +		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
> +						    qp->sq_max_wqes_per_wr +
> +						    qp->sq_spare_wqes);
> +
> +		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
> +			break;
> +
> +		if (qp->sq_max_wqes_per_wr <= 1)
> +			return -EINVAL;
> +
> +		++qp->sq.wqe_shift;
> +	}
> +
> +	qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
> +			 send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
>
> 	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
> 		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
> @@ -277,7 +401,8 @@ static int set_kernel_sq_size(struct mlx
> 		qp->sq.offset = 0;
> 	}
>
> -	cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
> +	cap->max_send_wr = qp->sq.max_post =
> +		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
> cap->max_send_sge = qp->sq.max_gs;
> /* We don't support inline sends for kernel QPs (yet) */
> cap->max_inline_data = 0;
> @@ -315,6 +440,12 @@ static int create_qp_common(struct mlx4_
> qp->rq.tail = 0;
> qp->sq.head = 0;
> qp->sq.tail = 0;
> + qp->sq_next_wqe = 0;
> +
> +	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
> +		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
> +	else
> +		qp->sq_signal_bits = 0;
>
> err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
> !!init_attr->srq, qp);
> if (err)
> @@ -405,11 +536,6 @@ static int create_qp_common(struct mlx4_
> */
> qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
>
> -	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
> -		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
> -	else
> -		qp->sq_signal_bits = 0;
> -
> qp->mqp.event = mlx4_ib_qp_event;
>
> return 0;
> @@ -904,7 +1030,7 @@ static int __mlx4_ib_modify_qp(struct ib
> ctrl = get_send_wqe(qp, i);
> ctrl->owner_opcode = cpu_to_be32(1 << 31);
>
> - stamp_send_wqe(qp, i);
> + stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
> }
> }
>
> @@ -1266,13 +1392,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
> unsigned long flags;
> int nreq;
> int err = 0;
> - int ind;
> - int size;
> + unsigned ind;
> + int uninitialized_var(stamp);
> + int uninitialized_var(size);
> int i;
>
> spin_lock_irqsave(&qp->rq.lock, flags);
>
> - ind = qp->sq.head;
> + ind = qp->sq_next_wqe;
>
> 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
> 		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
> @@ -1288,7 +1415,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
> 		}
> 
> 		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
> -		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
> +		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
>
> ctrl->srcrb_flags =
> (wr->send_flags & IB_SEND_SIGNALED ?
> @@ -1401,16 +1528,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
> 		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
> 			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
>
> + stamp = ind + qp->sq_spare_wqes;
> + ind += DIV_ROUND_UP(size * 16, 1 << qp->sq.wqe_shift);
> +
> /*
> * We can improve latency by not stamping the last
> * send queue WQE until after ringing the doorbell, so
> * only stamp here if there are still more WQEs to post.
> + *
> + * The same optimization applies to padding with a NOP WQE
> + * in case of WQE shrinking (used to prevent wrap-around
> + * in the middle of a WR).
> */
> -		if (wr->next)
> -			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
> -				       (qp->sq.wqe_cnt - 1));
> +		if (wr->next) {
> +			stamp_send_wqe(qp, stamp, size * 16);
> +			ind = pad_wraparound(qp, ind);
> +		}
> 
> -		++ind;
> 	}
>
> out:
> @@ -1432,8 +1566,10 @@ out:
> */
> mmiowb();
>
> -		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
> -			       (qp->sq.wqe_cnt - 1));
> +		stamp_send_wqe(qp, stamp, size * 16);
> +
> + ind = pad_wraparound(qp, ind);
> + qp->sq_next_wqe = ind;
> }
>
> spin_unlock_irqrestore(&qp->rq.lock, flags);
> Index: infiniband/drivers/net/mlx4/alloc.c
> ===================================================================
> --- infiniband.orig/drivers/net/mlx4/alloc.c	2007-10-10 17:12:12.259502000 +0200
> +++ infiniband/drivers/net/mlx4/alloc.c	2007-10-10 17:23:02.356137000 +0200
> @@ -151,6 +151,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev,
>
> memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
> }
> +
> +		if (BITS_PER_LONG == 64) {
> +			struct page **pages;
> +			pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
> +			if (!pages)
> +				goto err_free;
> +			for (i = 0; i < buf->nbufs; ++i)
> +				pages[i] = virt_to_page(buf->u.page_list[i].buf);
> +			buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
> +			kfree(pages);
> +			if (!buf->u.direct.buf)
> +				goto err_free;
> +		}
> }
>
> return 0;
> @@ -170,6 +183,9 @@ void mlx4_buf_free(struct mlx4_dev *dev,
> dma_free_coherent(&dev->pdev->dev, size,
> buf->u.direct.buf,
> buf->u.direct.map);
> else {
> + if (BITS_PER_LONG == 64)
> + vunmap(buf->u.direct.buf);
> +
> for (i = 0; i < buf->nbufs; ++i)
> dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
> buf->u.page_list[i].buf,
> Index: infiniband/include/linux/mlx4/device.h
> ===================================================================
> --- infiniband.orig/include/linux/mlx4/device.h	2007-10-10 17:21:17.954882000 +0200
> +++ infiniband/include/linux/mlx4/device.h	2007-10-10 17:23:02.363137000 +0200
> @@ -133,6 +133,11 @@ enum {
> MLX4_STAT_RATE_OFFSET = 5
> };
>
> +static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
> +{
> +	return (major << 32) | (minor << 16) | subminor;
> +}
> +
> struct mlx4_caps {
> u64 fw_ver;
> int num_ports;
> @@ -189,7 +194,7 @@ struct mlx4_buf_list {
> };
>
> struct mlx4_buf {
> - union {
> + struct {
> struct mlx4_buf_list direct;
> struct mlx4_buf_list *page_list;
> } u;
> Index: infiniband/include/linux/mlx4/qp.h
> ===================================================================
> --- infiniband.orig/include/linux/mlx4/qp.h	2007-10-10 17:12:38.460566000 +0200
> +++ infiniband/include/linux/mlx4/qp.h	2007-10-10 17:23:02.366140000 +0200
> @@ -154,7 +154,11 @@ struct mlx4_qp_context {
> u32 reserved5[10];
> };
>
> +/* Which firmware version adds support for the NEC (NoErrorCompletion) bit */
> +#define MLX4_FW_VER_WQE_CTRL_NEC	mlx4_fw_ver(2, 2, 232)
> +
> enum {
> + MLX4_WQE_CTRL_NEC = 1 << 29,
> MLX4_WQE_CTRL_FENCE = 1 << 6,
> MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
> MLX4_WQE_CTRL_SOLICITED = 1 << 1,
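
For reference, the firmware check in set_kernel_sq_size() above compares
packed 64-bit version numbers produced by the new mlx4_fw_ver() helper.
A tiny stand-alone illustration of that packing and comparison (not
driver code):

#include <stdio.h>
#include <stdint.h>

/* Same packing as the mlx4_fw_ver() inline added in device.h above. */
static uint64_t mlx4_fw_ver(uint64_t major, uint64_t minor, uint64_t subminor)
{
	return (major << 32) | (minor << 16) | subminor;
}

int main(void)
{
	uint64_t nec_min = mlx4_fw_ver(2, 2, 232);	/* MLX4_FW_VER_WQE_CTRL_NEC */
	uint64_t fw	 = mlx4_fw_ver(2, 3, 0);	/* an example device version */

	printf("NEC threshold: %#llx\n", (unsigned long long) nec_min);
	printf("fw 2.3.0 %s the NEC bit\n",
	       fw >= nec_min ? "supports" : "predates");
	return 0;
}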