[ewg] [PATCH 7/8 v3] mlx4: Add support for RDMAoE - address resolution

Eli Cohen eli at mellanox.co.il
Mon Jul 13 11:17:43 PDT 2009


The following patch handles address vector creation for RDMAoE ports. mlx4 needs
the MAC address of the remote node so it can place it in the WQE of a UD QP or in
the QP context of connected QPs. Resolution is done inline for link local and
multicast addresses (and can therefore run in atomic context); for any other
address the resolution services of core/addr.c are used. MLX transport packets
were also changed to accommodate RDMAoE.

Signed-off-by: Eli Cohen <eli at mellanox.co.il>
---
 drivers/infiniband/hw/mlx4/ah.c      |  228 ++++++++++++++++++++++++++-----
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   30 ++++-
 drivers/infiniband/hw/mlx4/qp.c      |  253 +++++++++++++++++++++++++++-------
 include/linux/mlx4/device.h          |   31 ++++-
 include/linux/mlx4/qp.h              |    8 +-
 5 files changed, 463 insertions(+), 87 deletions(-)
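
Note for reviewers (not part of the patch): the standalone sketch below
illustrates the MAC resolution policy that mlx4_ib_resolve_grh() implements.
Link local GIDs are translated inline from their EUI-64 interface ID,
multicast GIDs map to the corresponding Ethernet multicast MAC, and any other
address would have to go through core/addr.c (rdma_resolve_ip()) for
neighbour resolution. The function name resolve_mac_from_gid() is made up for
this example only.

#include <stdio.h>
#include <string.h>

static int resolve_mac_from_gid(const unsigned char gid[16],
				unsigned char mac[6], int *is_mcast)
{
	*is_mcast = 0;

	if (gid[0] == 0xfe && (gid[1] & 0xc0) == 0x80) {
		/* Link local: undo the EUI-64 mapping (flip U/L bit, drop ff:fe). */
		mac[0] = gid[8] ^ 0x02;
		mac[1] = gid[9];
		mac[2] = gid[10];
		mac[3] = gid[13];
		mac[4] = gid[14];
		mac[5] = gid[15];
		return 0;
	}

	if (gid[0] == 0xff) {
		/* Multicast: 33:33 followed by the low 32 bits of the address. */
		mac[0] = 0x33;
		mac[1] = 0x33;
		memcpy(mac + 2, gid + 12, 4);
		*is_mcast = 1;
		return 0;
	}

	/* Global address: needs asynchronous resolution, not handled here. */
	return -1;
}

int main(void)
{
	/* fe80::202:c9ff:fe01:203 -- link local GID built from MAC 00:02:c9:01:02:03 */
	unsigned char gid[16] = { 0xfe, 0x80, 0, 0, 0, 0, 0, 0,
				  0x02, 0x02, 0xc9, 0xff, 0xfe, 0x01, 0x02, 0x03 };
	unsigned char mac[6];
	int is_mcast;

	if (!resolve_mac_from_gid(gid, mac, &is_mcast))
		printf("%02x:%02x:%02x:%02x:%02x:%02x mcast=%d\n",
		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5], is_mcast);
	return 0;
}
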

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index c75ac94..c994e1f 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -31,63 +31,200 @@
  */
 
 #include "mlx4_ib.h"
+#include <rdma/ib_addr.h>
+#include <linux/inet.h>
+#include <linux/string.h>
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+static struct rdma_addr_client addr_client;
+
+struct resolve_ctx {
+	struct completion done;
+	int status;
+};
+
+
+static int status2err(int status)
 {
-	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
-	struct mlx4_ib_ah *ah;
+	return status; /* TBD */
+}
 
-	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-	if (!ah)
-		return ERR_PTR(-ENOMEM);
+static void resolve_callback(int status, struct sockaddr *src_addr,
+			     struct rdma_dev_addr *addr, void *context)
+{
+	struct resolve_ctx *ctx = context;
 
-	memset(&ah->av, 0, sizeof ah->av);
+	ctx->status = status;
+	complete(&ctx->done);
+}
 
-	ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
-	ah->av.g_slid  = ah_attr->src_path_bits;
-	ah->av.dlid    = cpu_to_be16(ah_attr->dlid);
-	if (ah_attr->static_rate) {
-		ah->av.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
-		while (ah->av.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
-		       !(1 << ah->av.stat_rate & dev->caps.stat_rate_support))
-			--ah->av.stat_rate;
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+			u8 *mac, int *is_mcast)
+{
+	struct mlx4_ib_rdmaoe *rdmaoe = &dev->rdmaoe;
+	struct sockaddr_in6 dst = {0};
+	struct rdma_dev_addr addr;
+	struct resolve_ctx ctx;
+	struct net_device *netdev;
+	int err = 0;
+	int ifidx;
+
+	*is_mcast = 0;
+	spin_lock(&rdmaoe->lock);
+	netdev = rdmaoe->netdevs[ah_attr->port_num - 1];
+	if (!netdev) {
+		spin_unlock(&rdmaoe->lock);
+		return -EINVAL;
+	}
+	ifidx = netdev->ifindex;
+	spin_unlock(&rdmaoe->lock);
+
+	init_completion(&ctx.done);
+	memcpy(dst.sin6_addr.s6_addr, ah_attr->grh.dgid.raw, sizeof ah_attr->grh.dgid.raw);
+	dst.sin6_family = AF_INET6;
+	dst.sin6_scope_id = ifidx;
+	if (rdma_link_local_addr(&dst.sin6_addr))
+		rdma_get_ll_mac(&dst.sin6_addr, mac);
+	else if (rdma_is_multicast_addr(&dst.sin6_addr)) {
+		rdma_get_mcast_mac(&dst.sin6_addr, mac);
+		*is_mcast = 1;
+	} else {
+		err = rdma_resolve_ip(&addr_client, NULL, (struct sockaddr *)&dst, &addr,
+				      2000, resolve_callback, &ctx);
+		if (!err)
+			wait_for_completion(&ctx.done);
+		else
+			ctx.status = err;
+
+		err = status2err(ctx.status);
 	}
-	ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+	return err;
+}
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+				  struct mlx4_ib_ah *ah)
+{
+	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+
+	ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.ib.g_slid  = ah_attr->src_path_bits;
 	if (ah_attr->ah_flags & IB_AH_GRH) {
-		ah->av.g_slid   |= 0x80;
-		ah->av.gid_index = ah_attr->grh.sgid_index;
-		ah->av.hop_limit = ah_attr->grh.hop_limit;
-		ah->av.sl_tclass_flowlabel |=
+		ah->av.ib.g_slid   |= 0x80;
+		ah->av.ib.gid_index = ah_attr->grh.sgid_index;
+		ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
+		ah->av.ib.sl_tclass_flowlabel |=
 			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
 				    ah_attr->grh.flow_label);
-		memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16);
+		memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
 	}
 
+	ah->av.ib.dlid    = cpu_to_be16(ah_attr->dlid);
+	if (ah_attr->static_rate) {
+		ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.ib.stat_rate;
+	}
+	ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
 	return &ah->ibah;
 }
 
+static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+				   struct mlx4_ib_ah *ah)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+	struct mlx4_dev *dev = ibdev->dev;
+	u8 mac[6];
+	int err;
+	int is_mcast;
+
+	err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast);
+	if (err)
+		return ERR_PTR(err);
+
+	memcpy(ah->av.eth.mac_0_1, mac, 2);
+	memcpy(ah->av.eth.mac_2_5, mac + 2, 4);
+	ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.ib.g_slid = 0x80;
+	if (ah_attr->static_rate) {
+		ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.ib.stat_rate;
+	}
+
+	/*
+	 * HW requires multicast LID so we just choose one.
+	 */
+	if (is_mcast)
+		ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+	memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
+	ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+	return &ah->ibah;
+}
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_ib_ah *ah;
+	enum ib_port_link_type link_type;
+	struct ib_ah *ret;
+
+	ah = kzalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	link_type = ib_get_port_link_type(pd->device, ah_attr->port_num);
+	if (link_type == PORT_LINK_ETH) {
+		if (!(ah_attr->ah_flags & IB_AH_GRH)) {
+			ret = ERR_PTR(-EINVAL);
+			goto out;
+		} else {
+			/* TBD: handle being called from atomic context,
+			 * where we must not sleep.  We don't expect this
+			 * today since we currently only translate link
+			 * local addresses, which does not require
+			 * sleeping. */
+			ret = create_rdmaoe_ah(pd, ah_attr, ah);
+			if (IS_ERR(ret))
+				goto out;
+			else
+				return ret;
+		}
+	} else
+		return create_ib_ah(pd, ah_attr, ah); /* never fails */
+
+out:
+	kfree(ah);
+	return ret;
+}
+
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
 {
 	struct mlx4_ib_ah *ah = to_mah(ibah);
+	enum ib_port_link_type lt;
 
+	lt = ib_get_port_link_type(ibah->device, ah_attr->port_num);
 	memset(ah_attr, 0, sizeof *ah_attr);
-	ah_attr->dlid	       = be16_to_cpu(ah->av.dlid);
-	ah_attr->sl	       = be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
-	ah_attr->port_num      = be32_to_cpu(ah->av.port_pd) >> 24;
-	if (ah->av.stat_rate)
-		ah_attr->static_rate = ah->av.stat_rate - MLX4_STAT_RATE_OFFSET;
-	ah_attr->src_path_bits = ah->av.g_slid & 0x7F;
+	ah_attr->dlid	       = lt == PORT_LINK_IB ? be16_to_cpu(ah->av.ib.dlid) : 0;
+	ah_attr->sl	       = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+	ah_attr->port_num      = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+	if (ah->av.ib.stat_rate)
+		ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
+	ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
 
 	if (mlx4_ib_ah_grh_present(ah)) {
 		ah_attr->ah_flags = IB_AH_GRH;
 
 		ah_attr->grh.traffic_class =
-			be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20;
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20;
 		ah_attr->grh.flow_label =
-			be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff;
-		ah_attr->grh.hop_limit  = ah->av.hop_limit;
-		ah_attr->grh.sgid_index = ah->av.gid_index;
-		memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16);
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff;
+		ah_attr->grh.hop_limit  = ah->av.ib.hop_limit;
+		ah_attr->grh.sgid_index = ah->av.ib.gid_index;
+		memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
 	}
 
 	return 0;
@@ -98,3 +235,30 @@ int mlx4_ib_destroy_ah(struct ib_ah *ah)
 	kfree(to_mah(ah));
 	return 0;
 }
+
+int mlx4_ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac)
+{
+	int err;
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct ib_ah_attr ah_attr = {
+		.port_num = port,
+	};
+	int is_mcast;
+
+	memcpy(ah_attr.grh.dgid.raw, gid, 16);
+	err = mlx4_ib_resolve_grh(ibdev, &ah_attr, mac, &is_mcast);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_ib_addr_init(void)
+{
+	rdma_addr_register_client(&addr_client);
+}
+
+void mlx4_ib_addr_cleanup(void)
+{
+	rdma_addr_unregister_client(&addr_client);
+}
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 8a7dd67..e2d6e62 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -138,6 +138,7 @@ struct mlx4_ib_qp {
 	u8			resp_depth;
 	u8			sq_no_prefetch;
 	u8			state;
+	int			mlx_type;
 };
 
 struct mlx4_ib_srq {
@@ -157,7 +158,15 @@ struct mlx4_ib_srq {
 
 struct mlx4_ib_ah {
 	struct ib_ah		ibah;
-	struct mlx4_av		av;
+	union mlx4_ext_av       av;
+};
+
+struct mlx4_ib_rdmaoe {
+	spinlock_t		lock;
+	struct net_device      *netdevs[MLX4_MAX_PORTS];
+	int			enstate[MLX4_MAX_PORTS];
+	int			mtu[MLX4_MAX_PORTS];
+	struct notifier_block 	nb;
 };
 
 struct mlx4_ib_dev {
@@ -175,6 +184,8 @@ struct mlx4_ib_dev {
 	spinlock_t		sm_lock;
 
 	struct mutex		cap_mask_mutex;
+
+	struct mlx4_ib_rdmaoe	rdmaoe;
 };
 
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
@@ -312,10 +323,25 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
 			 u64 iova);
 int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
 int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+void mlx4_ib_addr_init(void);
+void mlx4_ib_addr_cleanup(void);
+
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+			u8 *mac, int *is_mcast);
+
+int mlx4_ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac);
 
 static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
 {
-	return !!(ah->av.g_slid & 0x80);
+	/*
+	 * port number is located at the same place for both IB and Eth
+	 */
+	u8 port = (be32_to_cpu(ah->av.ib.port_pd) >> 24) & 3;
+
+	if (ib_get_port_link_type(ah->ibah.device, port) == PORT_LINK_ETH)
+		return 1;
+	else
+		return !!(ah->av.ib.g_slid & 0x80);
 }
 
 #endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 20724ae..7a6b765 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/log2.h>
+#include <linux/netdevice.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
@@ -47,14 +48,21 @@ enum {
 
 enum {
 	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
-	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f
+	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
+	MLX4_IB_LINK_TYPE_IB		= 0,
+	MLX4_IB_LINK_TYPE_ETH		= 1
 };
 
 enum {
 	/*
 	 * Largest possible UD header: send with GRH and immediate data.
+	 * 4 bytes added to accommodate an Ethernet header instead of an LRH
 	 */
-	MLX4_IB_UD_HEADER_SIZE		= 72
+	MLX4_IB_UD_HEADER_SIZE		= 76
+};
+
+enum {
+	MLX4_RDMAOE_ETHERTYPE = 0x8915
 };
 
 struct mlx4_ib_sqp {
@@ -62,7 +70,10 @@ struct mlx4_ib_sqp {
 	int			pkey_index;
 	u32			qkey;
 	u32			send_psn;
-	struct ib_ud_header	ud_header;
+	union {
+		struct ib_ud_header	ib;
+		struct eth_ud_header	eth;
+	} hdr;
 	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
 };
 
@@ -782,18 +793,6 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
 	return 0;
 }
 
-static int to_mlx4_st(enum ib_qp_type type)
-{
-	switch (type) {
-	case IB_QPT_RC:		return MLX4_QP_ST_RC;
-	case IB_QPT_UC:		return MLX4_QP_ST_UC;
-	case IB_QPT_UD:		return MLX4_QP_ST_UD;
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:	return MLX4_QP_ST_MLX;
-	default:		return -1;
-	}
-}
-
 static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
 				   int attr_mask)
 {
@@ -843,6 +842,12 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
 static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 			 struct mlx4_qp_path *path, u8 port)
 {
+	int err;
+	int is_eth = ib_get_port_link_type(&dev->ib_dev, port) ==
+							PORT_LINK_ETH ? 1 : 0;
+	u8 mac[6];
+	int is_mcast;
+
 	path->grh_mylmc     = ah->src_path_bits & 0x7f;
 	path->rlid	    = cpu_to_be16(ah->dlid);
 	if (ah->static_rate) {
@@ -873,9 +878,36 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 	path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
 		((port - 1) << 6) | ((ah->sl & 0xf) << 2);
 
+	if (is_eth) {
+		if (!(ah->ah_flags & IB_AH_GRH))
+			return -1;
+
+		err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast);
+		if (err)
+			return err;
+
+		memcpy(path->dmac_h, mac, 2);
+		memcpy(path->dmac_l, mac + 2, 4);
+		path->ackto = MLX4_IB_LINK_TYPE_ETH;
+		/* use index 0 into MAC table for RDMAoE */
+		path->grh_mylmc &= 0x80;
+	}
+
 	return 0;
 }
 
+static int to_mlx4_st(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_RC:         return MLX4_QP_ST_RC;
+	case IB_QPT_UC:         return MLX4_QP_ST_UC;
+	case IB_QPT_UD:         return MLX4_QP_ST_UD;
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:        return MLX4_QP_ST_MLX;
+	default:                return -1;
+	}
+}
+
 static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -972,7 +1004,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	}
 
 	if (attr_mask & IB_QP_TIMEOUT) {
-		context->pri_path.ackto = attr->timeout << 3;
+		context->pri_path.ackto |= (attr->timeout << 3);
 		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
 	}
 
@@ -1206,8 +1238,8 @@ out:
 	return err;
 }
 
-static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
-			    void *wqe, unsigned *mlx_seg_len)
+static int build_ib_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+			       void *wqe, unsigned *mlx_seg_len)
 {
 	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
 	struct mlx4_wqe_mlx_seg *mlx = wqe;
@@ -1223,61 +1255,171 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 	for (i = 0; i < wr->num_sge; ++i)
 		send_size += wr->sg_list[i].length;
 
-	ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->ud_header);
+	ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->hdr.ib);
 
-	sqp->ud_header.lrh.service_level   =
-		be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
-	sqp->ud_header.lrh.destination_lid = ah->av.dlid;
-	sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid & 0x7f);
+	sqp->hdr.ib.lrh.service_level   =
+		be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+	sqp->hdr.ib.lrh.destination_lid = ah->av.ib.dlid;
+	sqp->hdr.ib.lrh.source_lid      = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
 	if (mlx4_ib_ah_grh_present(ah)) {
-		sqp->ud_header.grh.traffic_class =
-			(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
-		sqp->ud_header.grh.flow_label    =
-			ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
-		sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
-		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
-				  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
-		memcpy(sqp->ud_header.grh.destination_gid.raw,
-		       ah->av.dgid, 16);
+		sqp->hdr.ib.grh.traffic_class =
+			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+		sqp->hdr.ib.grh.flow_label    =
+			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		sqp->hdr.ib.grh.hop_limit     = ah->av.ib.hop_limit;
+		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+				  ah->av.ib.gid_index, &sqp->hdr.ib.grh.source_gid);
+		memcpy(sqp->hdr.ib.grh.destination_gid.raw,
+		       ah->av.ib.dgid, 16);
 	}
 
 	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
-				  (sqp->ud_header.lrh.destination_lid ==
+				  (sqp->hdr.ib.lrh.destination_lid ==
 				   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
-				  (sqp->ud_header.lrh.service_level << 8));
-	mlx->rlid   = sqp->ud_header.lrh.destination_lid;
+				  (sqp->hdr.ib.lrh.service_level << 8));
+	mlx->rlid   = sqp->hdr.ib.lrh.destination_lid;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->hdr.ib.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
+		sqp->hdr.ib.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->hdr.ib.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->hdr.ib.immediate_present = 1;
+		sqp->hdr.ib.immediate_data    = wr->ex.imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sqp->hdr.ib.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	if (sqp->hdr.ib.lrh.destination_lid == IB_LID_PERMISSIVE)
+		sqp->hdr.ib.lrh.source_lid = IB_LID_PERMISSIVE;
+	sqp->hdr.ib.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+	else
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+	sqp->hdr.ib.bth.pkey = cpu_to_be16(pkey);
+	sqp->hdr.ib.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->hdr.ib.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->hdr.ib.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->hdr.ib.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->hdr.ib, sqp->header_buf);
+
+	/*
+	 * Inline data segments may not cross a 64 byte boundary.  If
+	 * our UD header is bigger than the space available up to the
+	 * next 64 byte boundary in the WQE, use two inline data
+	 * segments to hold the UD header.
+	 */
+	spc = MLX4_INLINE_ALIGN -
+		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (header_size <= spc) {
+		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+		memcpy(inl + 1, sqp->header_buf, header_size);
+		i = 1;
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		memcpy(inl + 1, sqp->header_buf, spc);
+
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+		/*
+		 * Need a barrier here to make sure all the data is
+		 * visible before the byte_count field is set.
+		 * Otherwise the HCA prefetcher could grab the 64-byte
+		 * chunk with this inline segment and get a valid (!=
+		 * 0xffffffff) byte count but stale data, and end up
+		 * generating a packet with bad headers.
+		 *
+		 * The first inline segment's byte_count field doesn't
+		 * need a barrier, because it comes after a
+		 * control/MLX segment and therefore is at an offset
+		 * of 16 mod 64.
+		 */
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+		i = 2;
+	}
+
+	*mlx_seg_len =
+		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+	return 0;
+}
+
+static int build_eth_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+				void *wqe, unsigned *mlx_seg_len)
+{
+	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+	struct mlx4_wqe_mlx_seg *mlx = wqe;
+	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	u16 pkey;
+	int send_size;
+	int header_size;
+	int spc;
+	int i;
+	void *tmp;
+
+	send_size = 0;
+	for (i = 0; i < wr->num_sge; ++i)
+		send_size += wr->sg_list[i].length;
+
+	ib_rdmaoe_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), &sqp->hdr.eth);
+
+	if (mlx4_ib_ah_grh_present(ah)) {
+		sqp->hdr.eth.grh.traffic_class =
+			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+		sqp->hdr.eth.grh.flow_label    =
+			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		sqp->hdr.eth.grh.hop_limit     = ah->av.ib.hop_limit;
+		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+				  ah->av.ib.gid_index, &sqp->hdr.eth.grh.source_gid);
+		memcpy(sqp->hdr.eth.grh.destination_gid.raw,
+		       ah->av.ib.dgid, 16);
+	}
+
+	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
 	switch (wr->opcode) {
 	case IB_WR_SEND:
-		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
-		sqp->ud_header.immediate_present = 0;
+		sqp->hdr.eth.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
+		sqp->hdr.eth.immediate_present = 0;
 		break;
 	case IB_WR_SEND_WITH_IMM:
-		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
-		sqp->ud_header.immediate_present = 1;
-		sqp->ud_header.immediate_data    = wr->ex.imm_data;
+		sqp->hdr.eth.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->hdr.eth.immediate_present = 1;
+		sqp->hdr.eth.immediate_data    = wr->ex.imm_data;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
-	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
-		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
-	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	memcpy(sqp->hdr.eth.eth.dmac_h, ah->av.eth.mac_0_1, 2);
+	memcpy(sqp->hdr.eth.eth.dmac_h + 2, ah->av.eth.mac_2_5, 2);
+	memcpy(sqp->hdr.eth.eth.dmac_l, ah->av.eth.mac_2_5 + 2, 2);
+	tmp = to_mdev(sqp->qp.ibqp.device)->rdmaoe.netdevs[sqp->qp.port - 1]->dev_addr;
+	memcpy(sqp->hdr.eth.eth.smac_h, tmp, 2);
+	memcpy(sqp->hdr.eth.eth.smac_l, tmp + 2, 4);
+	sqp->hdr.eth.eth.type = cpu_to_be16(MLX4_RDMAOE_ETHERTYPE);
+	sqp->hdr.eth.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
 	if (!sqp->qp.ibqp.qp_num)
 		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
 	else
 		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
-	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
-	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
-	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
-	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+	sqp->hdr.eth.bth.pkey = cpu_to_be16(pkey);
+	sqp->hdr.eth.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->hdr.eth.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->hdr.eth.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
 					       sqp->qkey : wr->wr.ud.remote_qkey);
-	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+	sqp->hdr.eth.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
 
-	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+	header_size = rdmaoe_ud_header_pack(&sqp->hdr.eth, sqp->header_buf);
 
 	if (0) {
 		printk(KERN_ERR "built UD header of size %d:\n", header_size);
@@ -1333,6 +1475,15 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 	return 0;
 }
 
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+			    void *wqe, unsigned *mlx_seg_len)
+{
+	if (ib_get_port_link_type(sqp->qp.ibqp.device, sqp->qp.port) == PORT_LINK_IB)
+		return build_ib_mlx_header(sqp, wr, wqe, mlx_seg_len);
+	else
+		return build_eth_mlx_header(sqp, wr, wqe, mlx_seg_len);
+}
+
 static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
 {
 	unsigned cur;
@@ -1414,6 +1565,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
 	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
 	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
 	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+	dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+	memcpy(dseg->mac_0_1, to_mah(wr->wr.ud.ah)->av.eth.mac_0_1, 6);
 }
 
 static void set_mlx_icrc_seg(void *dseg)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 3aff8a6..b73b5f0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -66,7 +66,8 @@ enum {
 	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
 	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1 << 19,
 	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1 << 20,
-	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1 << 21
+	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1 << 21,
+	MLX4_DEV_CAP_FLAG_RDMAOE	= 1 << 30
 };
 
 enum {
@@ -371,6 +372,28 @@ struct mlx4_av {
 	u8			dgid[16];
 };
 
+struct mlx4_eth_av {
+	__be32		port_pd;
+	u8		reserved1;
+	u8		smac_idx;
+	u16		reserved2;
+	u8		reserved3;
+	u8		gid_index;
+	u8		stat_rate;
+	u8		hop_limit;
+	__be32		sl_tclass_flowlabel;
+	u8		dgid[16];
+	u32		reserved4[2];
+	__be16		vlan;
+	u8		mac_0_1[2];
+	u8		mac_2_5[4];
+};
+
+union mlx4_ext_av {
+	struct mlx4_av		ib;
+	struct mlx4_eth_av	eth;
+};
+
 struct mlx4_dev {
 	struct pci_dev	       *pdev;
 	unsigned long		flags;
@@ -399,6 +422,12 @@ struct mlx4_init_port_param {
 		if (((type) == MLX4_PORT_TYPE_IB ? (dev)->caps.port_mask : \
 		     ~(dev)->caps.port_mask) & 1 << ((port) - 1))
 
+#define mlx4_foreach_ib_transport_port(port, dev)			\
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
+		if (((dev)->caps.port_mask & 1 << ((port) - 1)) ||	\
+		    ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_RDMAOE))
+
+
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 		   struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index bf8f119..d73534f 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -112,7 +112,9 @@ struct mlx4_qp_path {
 	u8			snooper_flags;
 	u8			reserved3[2];
 	u8			counter_index;
-	u8			reserved4[7];
+	u8			reserved4;
+	u8			dmac_h[2];
+	u8			dmac_l[4];
 };
 
 struct mlx4_qp_context {
@@ -218,7 +220,9 @@ struct mlx4_wqe_datagram_seg {
 	__be32			av[8];
 	__be32			dqpn;
 	__be32			qkey;
-	__be32			reservd[2];
+	__be16			vlan;
+	u8			mac_0_1[2];
+	u8			mac_2_5[4];
 };
 
 struct mlx4_wqe_lso_seg {
-- 
1.6.3.3



