[openib-general] [PATCH] IPoIB CM Experimental support
Michael S. Tsirkin
mst at mellanox.co.il
Tue Dec 5 08:19:44 PST 2006
The following patch adds experimental support for IPoIB connected mode.
The idea is to increase performance by increasing the MTU
from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD.
With this code, I'm able to get 800MByte/sec or more with netperf
without options on a Mellanox 4x back-to-back DDR system.
Please review.
I labeled CM support as experimental, although it's been very stable for me,
mostly because there are still some things to be addressed before it's as usable
as IPoIB UD. I am very interested in getting this code in shape for merging as
early as possible, as opposed to maintaining it out of tree until it's fully
mature, and I tried to split the CM code in a separate file to make this
feasible.
Let me know whether this was a good idea, or whether more needs to be done
in this direction.
Note that the connected mode support adds very little overhead when not activated
at run time, and zero data-path overhead when not activated at compile time.
Here's a short description of what the patch does:
a. The code's here:
git://staging.openfabrics.org/~mst/linux-2.6/.git ipoib_cm_branch
This is based on 2.6.19, so
~>git diff v2.6.19..ipoib_cm_branch
will show what I have done so far.
b. How to activate:
Server:
#modprobe ib_ipoib
#/sbin/ifconfig ib0 mtu 65520
#./netperf-2.4.2/src/netserver
Client:
#modprobe ib_ipoib
#/sbin/ifconfig ib0 mtu 65520
#./netperf-2.4.2/src/netperf -H 11.4.3.68 -f M
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68)
port 0 AF_INET : demo
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. MBytes/sec
87380 16384 16384 10.01 891.21
c. TODO list
1. Clean up stale connections
4. (Optional) S/G support
5. (Optional) Make CM use same CQ IPoIB uses for UD
d. Limitations
UDP multicast and UDP connections to IPoIB UD mode
currently don't work since we get packets that are too large to
send over a UD QP.
As a workaround, one can now create separate interfaces
for use with CM and UD mode.
e. Some notes on code
1. SRQ is used for scalability to large cluster sizes
2. Only RC connections are used (UC does not support SRQ now)
3. Retry count is set to 0 since spec draft warns against retries
4. Each connection is used for data transfers in only 1 direction,
so each connection is either active(TX) or passive (RX).
2 sides that want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions -
this keeps the code simple without CQ resize and other tricks
I'm looking at ways to limit the path mtu
for these connections, to make it work.
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
---
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
index c75322d..7aa3a25 100644
--- a/drivers/infiniband/ulp/ipoib/Kconfig
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -8,6 +8,15 @@ config INFINIBAND_IPOIB
See Documentation/infiniband/ipoib.txt for more information
+config INFINIBAND_IPOIB_CM
+ bool "IP-over-InfiniBand Connected Mode support"
+ depends on INFINIBAND_IPOIB && EXPERIMENTAL
+ default n
+ ---help---
+ This option enables experimental support for IPoIB connected mode.
+ After enabling this option, you need to increase the interface MTU
+ with e.g. ifconfig ib0 mtu 65520 to actually create connections.
+
config INFINIBAND_IPOIB_DEBUG
bool "IP-over-InfiniBand debugging" if EMBEDDED
depends on INFINIBAND_IPOIB
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
index 8935e74..f01a24b 100644
--- a/drivers/infiniband/ulp/ipoib/Makefile
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -6,4 +6,5 @@ ib_ipoib-y := ipoib_main.o \
ipoib_verbs.o \
ipoib_vlan.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 0b8a79d..545cdae 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -62,6 +62,9 @@ enum {
IPOIB_ENCAP_LEN = 4,
+ IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
+ IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
+
IPOIB_RX_RING_SIZE = 128,
IPOIB_TX_RING_SIZE = 64,
IPOIB_MAX_QUEUE_SIZE = 8192,
@@ -81,6 +84,7 @@ enum {
IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7,
IPOIB_MCAST_STARTED = 8,
+ IPOIB_FLAG_NETIF_STOPPED = 9,
IPOIB_MAX_BACKOFF_SECONDS = 16,
@@ -113,6 +117,49 @@ struct ipoib_tx_buf {
DECLARE_PCI_UNMAP_ADDR(mapping)
};
+struct ib_cm_id;
+
+struct ipoib_cm_data {
+ __be32 qpn; /* High byte MUST be ignored on receive */
+ __be32 mtu;
+};
+
+struct ipoib_cm_rx {
+ struct ib_cm_id *id;
+ struct ib_qp *qp;
+ struct list_head list;
+ struct net_device *dev;
+};
+
+struct ipoib_cm_tx {
+ struct ib_cm_id *id;
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct list_head list;
+ struct net_device *dev;
+ struct ipoib_neigh *neigh;
+ struct ipoib_path *path;
+ struct ipoib_tx_buf *tx_ring;
+ unsigned tx_head;
+ unsigned tx_tail;
+ unsigned long flags;
+ u32 mtu;
+ struct ib_wc ibwc[IPOIB_NUM_WC];
+};
+
+struct ipoib_cm_dev_priv {
+ struct ib_cq *cq;
+ struct ib_srq *srq;
+ struct ipoib_rx_buf *srq_ring;
+ struct ib_cm_id *id;
+ struct list_head passive_ids;
+ struct work_struct start_task;
+ struct work_struct reap_task;
+ struct list_head start_list;
+ struct list_head reap_list;
+ struct ib_wc ibwc[IPOIB_NUM_WC];
+};
+
/*
* Device private locking: tx_lock protects members used in TX fast
* path (and we use LLTX so upper layers don't do extra locking).
@@ -179,6 +226,8 @@ struct ipoib_dev_priv {
struct list_head child_intfs;
struct list_head list;
+ struct ipoib_cm_dev_priv cm;
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
struct list_head fs_list;
struct dentry *mcg_dentry;
@@ -212,6 +261,7 @@ struct ipoib_path {
struct ipoib_neigh {
struct ipoib_ah *ah;
+ struct ipoib_cm_tx *cm;
union ib_gid dgid;
struct sk_buff_head queue;
@@ -315,6 +365,93 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
void ipoib_pkey_poll(void *dev);
int ipoib_pkey_dev_delay_open(struct net_device *dev);
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+#define IPOIB_FLAGS_RC 0x80
+#define IPOIB_FLAGS_UC 0x40
+
+#define IPOIB_CM_ENABLED(ha) (ha[0] & IPOIB_FLAGS_RC)
+
+static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
+{
+ /* Simple heuristic: dev->mtu > 2K ==> connected mode */
+ return (IPOIB_CM_ENABLED(n->ha) &&
+ dev->mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN);
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+ return neigh->cm;
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
+int ipoib_cm_dev_open(struct net_device *dev);
+void ipoib_cm_dev_stop(struct net_device *dev);
+int ipoib_cm_dev_init(struct net_device *dev);
+void ipoib_cm_dev_cleanup(struct net_device *dev);
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+ struct ipoib_neigh *neigh);
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
+#else
+
+#define IPOIB_CM_ENABLED(ha) (0)
+
+static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
+
+{
+ return 0;
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+ return NULL;
+}
+
+static inline
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+ return;
+}
+
+static inline
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+ struct ipoib_neigh *neigh)
+{
+ return NULL;
+}
+
+static inline
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+ return;
+}
+
+#endif
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
void ipoib_create_debug_files(struct net_device *dev);
void ipoib_delete_debug_files(struct net_device *dev);
@@ -392,4 +529,7 @@ extern int ipoib_debug_level;
#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw)
+#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
+
+
#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644
index 0000000..a40eb4c
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_cache.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(cm_data_debug_level,
+ "Enable data path debug tracing for connected mode if > 0");
+#endif
+
+#include "ipoib.h"
+
+#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
+
+#define IPOIB_OP_SRQ (1ul << 30)
+
+struct ipoib_cm_id {
+ struct ib_cm_id *id;
+ int flags;
+ u32 remote_qpn;
+ u32 remote_mtu;
+};
+
+int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
+
+static int ipoib_cm_post_receive(struct net_device *dev, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_sge list;
+ struct ib_recv_wr param;
+ struct ib_recv_wr *bad_wr;
+ int ret;
+
+ list.addr = priv->cm.srq_ring[id].mapping;
+ list.length = IPOIB_CM_BUF_SIZE;
+ list.lkey = priv->mr->lkey;
+
+ param.next = NULL;
+ param.wr_id = id | IPOIB_OP_SRQ; /* ring index tagged with the SRQ op bit */
+ param.sg_list = &list;
+ param.num_sge = 1;
+
+ ret = ib_post_srq_recv(priv->cm.srq, &param, &bad_wr);
+ if (unlikely(ret)) { /* post failed: unmap and free this ring slot's skb */
+ ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+ dma_unmap_single(priv->ca->dma_device,
+ priv->cm.srq_ring[id].mapping,
+ IPOIB_CM_BUF_SIZE, DMA_FROM_DEVICE);
+ dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+ priv->cm.srq_ring[id].skb = NULL;
+ }
+
+ return ret;
+}
+
+static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct sk_buff *skb;
+ dma_addr_t addr;
+
+ skb = dev_alloc_skb(IPOIB_CM_BUF_SIZE + 12);
+ if (!skb)
+ return -ENOMEM;
+
+ /*
+ * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+ * IP header to a multiple of 16.
+ */
+ skb_reserve(skb, 12);
+
+ addr = dma_map_single(priv->ca->dma_device,
+ skb->data, IPOIB_CM_BUF_SIZE,
+ DMA_FROM_DEVICE);
+ if (unlikely(dma_mapping_error(addr))) {
+ dev_kfree_skb_any(skb);
+ return -EIO;
+ }
+
+ priv->cm.srq_ring[id].skb = skb;
+ priv->cm.srq_ring[id].mapping = addr;
+
+ return 0;
+}
+
+static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr attr = {
+ .send_cq = priv->cm.cq, /* does not matter, we never send anything */
+ .recv_cq = priv->cm.cq,
+ .srq = priv->cm.srq,
+ .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+ .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_RC,
+ };
+ return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_modify_rx_rts(struct net_device *dev,
+ struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
+ return ret;
+ }
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
+ return ret;
+ }
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+ return ret;
+ }
+ qp_attr.rq_psn = 0 /* FIXME */;
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
+ struct ib_qp *qp, struct ib_cm_req_event_param *req)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_data data = {};
+ struct ib_cm_rep_param rep = {};
+
+ data.qpn = cpu_to_be32(priv->qp->qp_num);
+ data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+ rep.private_data = &data;
+ rep.private_data_len = sizeof data;
+ rep.flow_control = 0;
+ rep.rnr_retry_count = req->rnr_retry_count;
+ rep.target_ack_delay = 20; /* FIXME */
+ rep.srq = 1;
+ rep.qp_num = qp->qp_num;
+ rep.starting_psn = 0 /* FIXME */;
+ return ib_send_cm_rep(cm_id, &rep);
+}
+
+static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct net_device *dev = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_rx *p;
+ unsigned long flags;
+ int ret;
+
+ ipoib_dbg(priv, "REQ arrived\n");
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ p->dev = dev;
+ p->id = cm_id;
+ p->qp = ipoib_cm_create_rx_qp(dev);
+ if (IS_ERR(p->qp)) {
+ ret = PTR_ERR(p->qp);
+ goto err_qp;
+ }
+
+ ret = ipoib_cm_modify_rx_rts(dev, cm_id, p->qp);
+ if (ret)
+ goto err_modify;
+
+ ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd);
+ if (ret) {
+ ipoib_warn(priv, "failed to send REP: %d\n", ret);
+ goto err_rep;
+ }
+
+ cm_id->context = p;
+ spin_lock_irqsave(&priv->lock, flags);
+ list_add(&p->list, &priv->cm.passive_ids);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return 0;
+
+err_rep:
+err_modify:
+ ib_destroy_qp(p->qp);
+err_qp:
+ kfree(p);
+ return ret;
+}
+
+int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct ipoib_cm_rx *p;
+ struct ipoib_dev_priv *priv;
+ unsigned long flags;
+ int ret;
+
+ switch (event->event) {
+ case IB_CM_REQ_RECEIVED:
+ return ipoib_cm_req_handler(cm_id, event);
+ case IB_CM_DREQ_RECEIVED:
+ p = cm_id->context;
+ ib_send_cm_drep(cm_id, NULL, 0);
+ /* Fall through */
+ case IB_CM_REJ_RECEIVED:
+ p = cm_id->context;
+ priv = netdev_priv(p->dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ if (list_empty(&p->list))
+ ret = 0; /* Connection is going away already. */
+ else {
+ list_del(&p->list);
+ ret = -ECONNRESET;
+ }
+ spin_unlock_irqrestore(&priv->lock, flags);
+ if (ret) {
+ ib_destroy_qp(p->qp);
+ kfree(p);
+ return ret;
+ }
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned int wr_id = wc->wr_id & ~IPOIB_OP_SRQ;
+ struct sk_buff *skb;
+ dma_addr_t addr;
+
+ ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n",
+ wr_id, wc->opcode, wc->status);
+
+ if (unlikely(wr_id >= ipoib_recvq_size)) {
+ ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
+ return;
+ }
+
+ skb = priv->cm.srq_ring[wr_id].skb;
+ addr = priv->cm.srq_ring[wr_id].mapping;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ ++priv->stats.rx_dropped;
+ goto repost;
+ }
+
+ /*
+ * If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id))) {
+ ++priv->stats.rx_dropped;
+ goto repost;
+ }
+
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
+
+ dma_unmap_single(priv->ca->dma_device, addr,
+ IPOIB_CM_BUF_SIZE, DMA_FROM_DEVICE);
+
+ skb_put(skb, wc->byte_len);
+
+ if (wc->slid != priv->local_lid ||
+ wc->src_qp != priv->qp->qp_num) {
+ skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+ skb->mac.raw = skb->data;
+ skb_pull(skb, IPOIB_ENCAP_LEN);
+
+ dev->last_rx = jiffies;
+ ++priv->stats.rx_packets;
+ priv->stats.rx_bytes += skb->len;
+
+ skb->dev = dev;
+ /* XXX get correct PACKET_ type here */
+ skb->pkt_type = PACKET_HOST;
+ netif_rx_ni(skb);
+ } else {
+ ipoib_dbg_data(priv, "dropping loopback packet\n");
+ dev_kfree_skb_any(skb);
+ }
+
+repost:
+ if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
+ ipoib_warn(priv, "ipoib_cm_post_receive failed "
+ "for buf %d\n", wr_id);
+}
+
+void ipoib_cm_rx_completion(struct ib_cq *cq, void *dev_ptr)
+{
+ struct net_device *dev = (struct net_device *) dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int n, i;
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ do {
+ n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->cm.ibwc);
+ for (i = 0; i < n; ++i)
+ ipoib_cm_handle_rx_wc(dev, priv->cm.ibwc + i);
+ } while (n == IPOIB_NUM_WC);
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_tx *tx,
+ unsigned int wr_id,
+ dma_addr_t addr, int len)
+{
+ struct ib_send_wr *bad_wr;
+
+ priv->tx_sge.addr = addr;
+ priv->tx_sge.length = len;
+
+ priv->tx_wr.wr_id = wr_id;
+
+ return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_tx_buf *tx_req;
+ dma_addr_t addr;
+
+ if (unlikely(skb->len > tx->mtu)) {
+ ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ skb->len, tx->mtu);
+ ++priv->stats.tx_dropped;
+ ++priv->stats.tx_errors;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ ipoib_dbg_data(priv, "sending packet %p, head %d length=%d connection=%p\n",
+ skb, tx->tx_head, skb->len, tx);
+
+ /*
+ * We put the skb into the tx_ring _before_ we call post_send()
+ * because it's entirely possible that the completion handler will
+ * run before we execute anything after the post_send(). That
+ * means we have to make sure everything is properly recorded and
+ * our state is consistent before we call post_send().
+ */
+ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
+ tx_req->skb = skb;
+ addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len,
+ DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(addr))) {
+ ++priv->stats.tx_errors;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+ pci_unmap_addr_set(tx_req, mapping, addr);
+
+ if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+ addr, skb->len))) {
+ ipoib_warn(priv, "post_send failed\n");
+ ++priv->stats.tx_errors;
+ dma_unmap_single(priv->ca->dma_device, addr, skb->len,
+ DMA_TO_DEVICE);
+ dev_kfree_skb_any(skb);
+ } else {
+ dev->trans_start = jiffies;
+ ++tx->tx_head;
+
+ if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
+ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+ netif_stop_queue(dev);
+ set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+ }
+ }
+}
+
+static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
+ struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned int wr_id = wc->wr_id;
+ struct ipoib_tx_buf *tx_req;
+ unsigned long flags;
+
+ ipoib_dbg_data(priv, "cm send completion: id %d, op %d, status: %d\n",
+ wr_id, wc->opcode, wc->status);
+
+ if (unlikely(wr_id >= ipoib_sendq_size)) {
+ ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_sendq_size);
+ return;
+ }
+
+ tx_req = &tx->tx_ring[wr_id];
+
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(tx_req, mapping),
+ tx_req->skb->len,
+ DMA_TO_DEVICE);
+
+ /* FIXME: is this right? Shouldn't we only increment on success? */
+ ++priv->stats.tx_packets;
+ priv->stats.tx_bytes += tx_req->skb->len;
+
+ dev_kfree_skb_any(tx_req->skb);
+
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ ++tx->tx_tail;
+ if (tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1 &&
+ test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) {
+ netif_wake_queue(dev); /* stopped flag is cleared only when we really wake */
+ }
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR) {
+ struct ipoib_neigh *neigh;
+
+ ipoib_dbg(priv, "failed cm send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+
+ spin_lock(&priv->lock);
+ neigh = tx->neigh;
+
+ if (neigh) {
+ neigh->cm = NULL;
+ list_del(&neigh->list);
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ ipoib_neigh_free(neigh);
+
+ tx->neigh = NULL;
+ }
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ }
+
+ clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
+
+ spin_unlock(&priv->lock);
+ }
+
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
+{
+ struct ipoib_cm_tx *tx = tx_ptr;
+ int n, i;
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ do {
+ n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
+ for (i = 0; i < n; ++i)
+ ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
+ } while (n == IPOIB_NUM_WC);
+}
+
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ if (!IPOIB_CM_ENABLED(dev->dev_addr))
+ return 0;
+
+ priv->cm.cq = ib_create_cq(priv->ca, ipoib_cm_rx_completion, NULL, dev,
+ ipoib_recvq_size + 1);
+ if (IS_ERR(priv->cm.cq)) {
+ printk(KERN_WARNING "%s: failed to create CQ\n", priv->ca->name);
+ return PTR_ERR(priv->cm.cq);
+ }
+
+ ib_req_notify_cq(priv->cm.cq, IB_CQ_NEXT_COMP);
+
+ priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
+ if (IS_ERR(priv->cm.id)) {
+ printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+ ib_destroy_cq(priv->cm.cq);
+ return PTR_ERR(priv->cm.id); /* negative errno, not the IS_ERR() boolean */
+ }
+
+ ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
+ 0, NULL);
+ if (ret) {
+ printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
+ IPOIB_CM_IETF_ID | priv->qp->qp_num);
+ ib_destroy_cm_id(priv->cm.id);
+ ib_destroy_cq(priv->cm.cq);
+ return ret;
+ }
+ return 0;
+}
+
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_rx *p;
+ unsigned long flags;
+
+ if (!IPOIB_CM_ENABLED(dev->dev_addr))
+ return;
+
+ ib_destroy_cm_id(priv->cm.id);
+ spin_lock_irqsave(&priv->lock, flags);
+ while (!list_empty(&priv->cm.passive_ids)) {
+ p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+ list_del_init(&p->list);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ib_destroy_cm_id(p->id);
+ ib_destroy_qp(p->qp);
+ kfree(p);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ib_destroy_cq(priv->cm.cq);
+}
+
+static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *p = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ struct ipoib_cm_data *data = event->private_data;
+ struct sk_buff_head skqueue;
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ p->mtu = be32_to_cpu(data->mtu);
+
+ if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) {
+ ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n",
+ p->mtu, priv->dev->mtu);
+ return -EINVAL;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+ return ret;
+ }
+
+ qp_attr.rq_psn = 0 /* FIXME */;
+ ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+ return ret;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+ return ret;
+ }
+ ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+ return ret;
+ }
+
+ skb_queue_head_init(&skqueue);
+
+ spin_lock_irqsave(&priv->lock, flags);
+ set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
+ if (p->neigh)
+ while ((skb = __skb_dequeue(&p->neigh->queue)))
+ __skb_queue_tail(&skqueue, skb);
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ while ((skb = __skb_dequeue(&skqueue))) {
+ skb->dev = p->dev;
+ if (dev_queue_xmit(skb))
+ ipoib_warn(priv, "dev_queue_xmit failed "
+ "to requeue packet\n");
+ }
+
+ ret = ib_send_cm_rtu(cm_id, NULL, 0);
+ if (ret) {
+ ipoib_warn(priv, "failed to send RTU: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr attr = {};
+ attr.recv_cq = priv->cm.cq;
+ attr.srq = priv->cm.srq;
+ attr.cap.max_send_wr = ipoib_sendq_size;
+ attr.cap.max_send_sge = 1;
+ attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ attr.qp_type = IB_QPT_RC;
+ attr.send_cq = cq;
+ return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_send_req(struct net_device *dev,
+ struct ib_cm_id *id, struct ib_qp *qp,
+ u32 qpn,
+ struct ib_sa_path_rec *pathrec)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_data data = {};
+ struct ib_cm_req_param req = {};
+
+ data.qpn = cpu_to_be32(priv->qp->qp_num);
+ data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+ req.primary_path = pathrec;
+ req.alternate_path = NULL;
+ req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+ req.qp_num = qp->qp_num;
+ req.qp_type = qp->qp_type;
+ req.private_data = &data;
+ req.private_data_len = sizeof data;
+ req.flow_control = 0;
+
+ req.starting_psn = 0; /* FIXME */
+
+ /*
+ * Pick some arbitrary defaults here; we could make these
+ * module parameters if anyone cared about setting them.
+ */
+ req.responder_resources = 4;
+ req.remote_cm_response_timeout = 20;
+ req.local_cm_response_timeout = 20;
+ req.retry_count = 0; /* RFC draft warns against retries */
+ req.rnr_retry_count = 0; /* RFC draft warns against retries */
+ req.max_cm_retries = 15;
+ req.srq = 1; /* flag, same value ipoib_cm_send_rep() uses for rep.srq */
+ return ib_send_cm_req(id, &req);
+}
+
+static int ipoib_cm_modify_tx_init(struct net_device *dev,
+ struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
+ if (ret) {
+ ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
+ return ret;
+ }
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+ qp_attr.port_num = priv->port;
+ qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ib_sa_path_rec *pathrec)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ int ret;
+
+ ipoib_dbg(priv, "Request connection %p for gid " IPOIB_GID_FMT " qpn 0x%x\n",
+ p, IPOIB_GID_ARG(pathrec->dgid), qpn);
+
+ p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
+ GFP_KERNEL);
+ if (!p->tx_ring) {
+ ipoib_warn(priv, "failed to allocate tx ring\n");
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+
+ p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
+ ipoib_sendq_size + 1);
+ if (IS_ERR(p->cq)) {
+ ret = PTR_ERR(p->cq);
+ ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
+ goto err_cq;
+ }
+
+ ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
+ goto err_req_notify;
+ }
+
+ p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+ if (IS_ERR(p->qp)) {
+ ret = PTR_ERR(p->qp);
+ ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
+ goto err_qp;
+ }
+
+ p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
+ if (IS_ERR(p->id)) {
+ ret = PTR_ERR(p->id);
+ ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
+ goto err_id;
+ }
+
+ ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
+ goto err_modify;
+ }
+
+ ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
+ if (ret) {
+ ipoib_warn(priv, "failed to send cm req: %d\n", ret);
+ goto err_send_cm;
+ }
+ return 0;
+
+err_send_cm:
+err_modify:
+ ib_destroy_cm_id(p->id);
+err_id:
+ p->id = NULL;
+ ib_destroy_qp(p->qp);
+err_req_notify:
+err_qp:
+ p->qp = NULL;
+ ib_destroy_cq(p->cq);
+err_cq:
+ p->cq = NULL;
+err_tx:
+ return ret;
+}
+
+void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ struct ipoib_tx_buf *tx_req;
+
+ ipoib_dbg(priv, "Destroy active connection %p. head 0x%x tail 0x%x\n",
+ p, p->tx_head, p->tx_tail);
+
+ if (p->id)
+ ib_destroy_cm_id(p->id);
+
+ if (p->qp)
+ ib_destroy_qp(p->qp);
+
+ if (p->cq)
+ ib_destroy_cq(p->cq);
+
+ if (p->tx_ring) {
+ while ((int) p->tx_tail - (int) p->tx_head < 0) {
+ tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(tx_req, mapping),
+ tx_req->skb->len,
+ DMA_TO_DEVICE);
+ dev_kfree_skb_any(tx_req->skb);
+ ++p->tx_tail;
+ }
+
+ kfree(p->tx_ring);
+ }
+
+ kfree(p);
+}
+
+int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *tx = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+ struct ipoib_neigh *neigh;
+ unsigned long flags;
+ int ret;
+
+ switch (event->event) {
+ case IB_CM_DREQ_RECEIVED:
+ ipoib_dbg(priv, "DREQ received.\n");
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ipoib_dbg(priv, "REP received.\n");
+ ret = ipoib_cm_rep_handler(cm_id, event);
+ if (ret)
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REJ_RECEIVED:
+ case IB_CM_TIMEWAIT_EXIT:
+ ipoib_dbg(priv, "CM error %d.\n", event->event);
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ spin_lock(&priv->lock);
+ neigh = tx->neigh;
+
+ if (neigh) {
+ neigh->cm = NULL;
+ list_del(&neigh->list);
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ ipoib_neigh_free(neigh);
+
+ tx->neigh = NULL;
+ }
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ }
+
+ spin_unlock(&priv->lock);
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a transmit-side connection context for @neigh over @path on @dev
+ * and queue it on priv->cm.start_list for the start work item.
+ *
+ * The context is allocated with GFP_ATOMIC, linked to the neighbour in both
+ * directions (neigh->cm and tx->neigh) and marked IPOIB_FLAG_INITIALIZED.
+ *
+ * Returns the new context, or NULL if the allocation fails.
+ */
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+				       struct ipoib_neigh *neigh)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_tx *conn = kzalloc(sizeof(*conn), GFP_ATOMIC);
+
+	if (!conn)
+		return NULL;
+
+	conn->dev = dev;
+	conn->path = path;
+	conn->neigh = neigh;
+	neigh->cm = conn;
+
+	list_add(&conn->list, &priv->cm.start_list);
+	set_bit(IPOIB_FLAG_INITIALIZED, &conn->flags);
+	queue_work(ipoib_workqueue, &priv->cm.start_task);
+
+	return conn;
+}
+
+/*
+ * Schedule the connection @tx for destruction.
+ *
+ * If IPOIB_FLAG_INITIALIZED is still set it is cleared, the connection is
+ * moved to priv->cm.reap_list, the reap work item is queued and tx->neigh is
+ * reset to NULL.  If the flag was already clear nothing is done.
+ */
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+
+	if (!test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags))
+		return;
+
+	list_move(&tx->list, &priv->cm.reap_list);
+	queue_work(ipoib_workqueue, &priv->cm.reap_task);
+	ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n",
+		  IPOIB_GID_ARG(tx->neigh->dgid));
+	tx->neigh = NULL;
+}
+
+/*
+ * Work handler for priv->cm.start_task: drains priv->cm.start_list and calls
+ * ipoib_cm_tx_init() for every connection queued on it.
+ *
+ * @dev_ptr: the struct net_device the work item was initialised with.
+ *
+ * priv->tx_lock and priv->lock are held while the list and the connection
+ * are examined, and are both released around the ipoib_cm_tx_init() call.
+ * If ipoib_cm_tx_init() fails, the neighbour still attached to the
+ * connection (if any) is unlinked and freed, and the connection is freed.
+ */
+void ipoib_cm_tx_start(void *dev_ptr)
+{
+ struct net_device *dev = dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh *neigh;
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+ int ret;
+
+ struct ib_sa_path_rec pathrec;
+ u32 qpn;
+
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ spin_lock(&priv->lock);
+ while (!list_empty(&priv->cm.start_list)) {
+ p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+ /* list_del_init() leaves p->list as a valid, empty list entry. */
+ list_del_init(&p->list);
+ neigh = p->neigh;
+ /*
+ * Take copies of the QPN and path record while still locked;
+ * ipoib_cm_tx_init() below runs with both locks released.
+ */
+ qpn = IPOIB_QPN(neigh->neighbour->ha);
+ memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+ spin_unlock(&priv->lock);
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+ ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ spin_lock(&priv->lock);
+ if (ret) {
+ /*
+ * p->neigh is read again here instead of reusing the value
+ * fetched before the locks were dropped.
+ */
+ neigh = p->neigh;
+ if (neigh) {
+ /*
+ * Clear neigh->cm first so that ipoib_neigh_free() does
+ * not call ipoib_cm_destroy_tx() on p, which is freed
+ * directly below.
+ */
+ neigh->cm = NULL;
+ list_del(&neigh->list);
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ ipoib_neigh_free(neigh);
+ }
+ list_del(&p->list);
+ kfree(p);
+ }
+ }
+ spin_unlock(&priv->lock);
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+/*
+ * Work handler for priv->cm.reap_task: removes every connection from
+ * priv->cm.reap_list and passes it to ipoib_cm_tx_destroy().
+ *
+ * @dev_ptr: the struct net_device the work item was initialised with.
+ *
+ * priv->tx_lock and priv->lock protect the list manipulation; both are
+ * released around each ipoib_cm_tx_destroy() call and re-taken afterwards.
+ */
+void ipoib_cm_tx_reap(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct list_head *reap = &priv->cm.reap_list;
+	struct ipoib_cm_tx *dead;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	spin_lock(&priv->lock);
+
+	while (!list_empty(reap)) {
+		/* Detach the head entry while the list is still locked. */
+		dead = list_entry(reap->next, struct ipoib_cm_tx, list);
+		list_del(&dead->list);
+
+		spin_unlock(&priv->lock);
+		spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+		ipoib_cm_tx_destroy(dead);
+
+		spin_lock_irqsave(&priv->tx_lock, flags);
+		spin_lock(&priv->lock);
+	}
+
+	spin_unlock(&priv->lock);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+/*
+ * Set up the connected-mode state of @dev: initialise the CM lists and work
+ * items, create the shared receive queue (SRQ), allocate the receive ring
+ * and allocate and post one receive buffer per ring slot.
+ *
+ * On success the first byte of the device hardware address is set to
+ * IPOIB_FLAGS_RC.
+ *
+ * Returns 0 on success.  On failure returns the ib_create_srq() error,
+ * -ENOMEM (ring or buffer allocation) or -EIO (posting a receive failed).
+ * Every failure after the SRQ exists goes through ipoib_cm_dev_cleanup().
+ */
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_srq_init_attr srq_init_attr = {
+		.attr = {
+			.max_wr = ipoib_recvq_size,
+			.max_sge = 1
+		}
+	};
+	int ret, i;
+
+	INIT_LIST_HEAD(&priv->cm.passive_ids);
+	INIT_LIST_HEAD(&priv->cm.reap_list);
+	INIT_LIST_HEAD(&priv->cm.start_list);
+	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start, dev);
+	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap, dev);
+
+	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+	if (IS_ERR(priv->cm.srq)) {
+		ret = PTR_ERR(priv->cm.srq);
+		/* Leave a NULL so ipoib_cm_dev_cleanup() sees "no SRQ". */
+		priv->cm.srq = NULL;
+		return ret;
+	}
+
+	/* kcalloc() zeroes the ring and checks the n * size product. */
+	priv->cm.srq_ring = kcalloc(ipoib_recvq_size, sizeof *priv->cm.srq_ring,
+				    GFP_KERNEL);
+	if (!priv->cm.srq_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		ret = -ENOMEM;
+		goto err_cleanup;
+	}
+
+	for (i = 0; i < ipoib_recvq_size; ++i) {
+		if (ipoib_cm_alloc_rx_skb(dev, i)) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+			ret = -ENOMEM;
+			goto err_cleanup;
+		}
+		if (ipoib_cm_post_receive(dev, i)) {
+			/* Name the function that actually failed. */
+			ipoib_warn(priv, "ipoib_cm_post_receive failed for buf %d\n", i);
+			ret = -EIO;
+			goto err_cleanup;
+		}
+	}
+
+	priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
+
+	return 0;
+
+err_cleanup:
+	/* Frees any buffers already in the ring, the ring and the SRQ. */
+	ipoib_cm_dev_cleanup(dev);
+	return ret;
+}
+
+/*
+ * Release the connected-mode receive resources of @dev: destroy the SRQ,
+ * unmap and free every receive buffer still held in the ring, then free the
+ * ring itself.
+ *
+ * Does nothing beyond the debug message when no SRQ exists.  Both
+ * priv->cm.srq and priv->cm.srq_ring are reset to NULL once released.
+ */
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int slot, err;
+
+	ipoib_dbg(priv, "Cleanup ipoib connected mode data.\n");
+
+	if (!priv->cm.srq)
+		return;
+
+	/* A failed destroy is only reported; cleanup carries on regardless. */
+	err = ib_destroy_srq(priv->cm.srq);
+	if (err)
+		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", err);
+	priv->cm.srq = NULL;
+
+	if (!priv->cm.srq_ring)
+		return;
+
+	for (slot = 0; slot < ipoib_recvq_size; ++slot) {
+		/* Slots without a buffer have nothing to unmap or free. */
+		if (!priv->cm.srq_ring[slot].skb)
+			continue;
+
+		dma_unmap_single(priv->ca->dma_device,
+				 pci_unmap_addr(&priv->cm.srq_ring[slot], mapping),
+				 IPOIB_CM_BUF_SIZE,
+				 DMA_FROM_DEVICE);
+		dev_kfree_skb_any(priv->cm.srq_ring[slot].skb);
+		priv->cm.srq_ring[slot].skb = NULL;
+	}
+
+	kfree(priv->cm.srq_ring);
+	priv->cm.srq_ring = NULL;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 8bf5e9e..a4b2d21 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -119,6 +119,7 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id)
return ret;
}
+
static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -273,10 +274,10 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->tx_lock, flags);
++priv->tx_tail;
- if (netif_queue_stopped(dev) &&
- test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
- priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+ if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags) &&
+ priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
netif_wake_queue(dev);
+ }
spin_unlock_irqrestore(&priv->tx_lock, flags);
if (wc->status != IB_WC_SUCCESS &&
@@ -378,6 +379,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev);
+ set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
}
}
}
@@ -429,6 +431,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
return -1;
}
+ ret = ipoib_cm_dev_open(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+ ipoib_ib_dev_stop(dev);
+ return -1;
+ }
+
clear_bit(IPOIB_STOP_REAPER, &priv->flags);
queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
@@ -514,6 +523,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+ ipoib_cm_dev_stop(dev);
+
/*
* Move our QP to the error state and then reinitialize in
* when all work requests have completed or have been flushed.
@@ -603,6 +614,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
return -ENODEV;
}
+ ipoib_cm_dev_init(dev);
+
if (dev->flags & IFF_UP) {
if (ipoib_ib_dev_open(dev)) {
ipoib_transport_dev_cleanup(dev);
@@ -659,6 +672,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
ipoib_mcast_stop_thread(dev, 1);
ipoib_mcast_dev_flush(dev);
+ ipoib_cm_dev_cleanup(dev);
ipoib_transport_dev_cleanup(dev);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 85522da..282c5ea 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -49,8 +49,6 @@
#include <net/dst.h>
-#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
-
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
@@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev)
netif_stop_queue(dev);
+ clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+
/*
* Now flush workqueue to make sure a scheduled task doesn't
* bring our internal state back up.
@@ -177,14 +177,27 @@ static int ipoib_stop(struct net_device *dev)
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int old_mtu = dev->mtu;
+
+ /* Simple heuristic: dev->mtu > 2K ==> connected mode */
+ /* flush paths if we switch modes so that connections are restarted */
+ if (IPOIB_CM_ENABLED(dev->dev_addr) &&
+ new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN &&
+ new_mtu <= IPOIB_CM_MTU) {
+ dev->mtu = new_mtu;
+ if (old_mtu <= IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+ ipoib_flush_paths(dev);
+ return 0;
+ }
if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
- return -EINVAL;
+ return -EINVAL;
priv->admin_mtu = new_mtu;
-
dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+ if (old_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+ ipoib_flush_paths(dev);
return 0;
}
@@ -414,6 +427,18 @@ static void path_rec_completion(int status,
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
sizeof(union ib_gid));
+ if (ipoib_cm_enabled(dev, neigh->neighbour)) {
+ if (!neigh->cm)
+ neigh->cm = ipoib_cm_create_tx(dev, path, neigh);
+ if (!neigh->cm) {
+ list_del(&neigh->list);
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ ipoib_neigh_free(neigh);
+ continue;
+ }
+ }
+
while ((skb = __skb_dequeue(&neigh->queue)))
__skb_queue_tail(&skqueue, skb);
}
@@ -522,7 +547,22 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
sizeof(union ib_gid));
- ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+ if (ipoib_cm_enabled(dev, neigh->neighbour)) {
+ if (!neigh->cm)
+ neigh->cm = ipoib_cm_create_tx(dev, path, neigh);
+ if (!neigh->cm) {
+ list_del(&neigh->list);
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ ipoib_neigh_free(neigh);
+ goto err_drop;
+ }
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ __skb_queue_tail(&neigh->queue, skb);
+ else
+ goto err_drop;
+ } else
+ ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
} else {
neigh->ah = NULL;
__skb_queue_tail(&neigh->queue, skb);
@@ -539,6 +579,7 @@ err_list:
err_path:
ipoib_neigh_free(neigh);
+err_drop:
++priv->stats.tx_dropped;
dev_kfree_skb_any(skb);
@@ -641,7 +682,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
neigh = *to_ipoib_neigh(skb->dst->neighbour);
- if (likely(neigh->ah)) {
+ if (ipoib_cm_get(neigh)) {
+ if (test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags)) {
+ ipoib_cm_send(dev, skb, neigh->cm);
+ goto out;
+ }
+ } else if (neigh->ah) {
if (unlikely(memcmp(&neigh->dgid.raw,
skb->dst->neighbour->ha + 4,
sizeof(union ib_gid)))) {
@@ -805,6 +851,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
neigh->neighbour = neighbour;
*to_ipoib_neigh(neighbour) = neigh;
+ neigh->cm = NULL;
return neigh;
}
@@ -812,6 +859,8 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
void ipoib_neigh_free(struct ipoib_neigh *neigh)
{
*to_ipoib_neigh(neigh->neighbour) = NULL;
+ if (neigh->cm)
+ ipoib_cm_destroy_tx(neigh->cm);
kfree(neigh);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3faa182..14337e9 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -594,7 +594,11 @@ void ipoib_mcast_join_task(void *dev_ptr)
priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
IPOIB_ENCAP_LEN;
- dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+ /* Simple heuristic: dev->mtu > 2K ==> connected mode.
+ * In this case do not touch dev->mtu. */
+ if (dev->mtu <= IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+ dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
--
MST
More information about the general
mailing list