[ofw] patch 1/2 Add support for RDMAoEth to the low level driver

Tzachi Dar tzachid at mellanox.co.il
Thu Jan 21 09:42:39 PST 2010


Hi James,
 
I'm currently working on some urgent issue so I don't have time to look
at it right now.
 
In any case, please note that I'm looking for a solution that will allow
people to have three modes of operation:
eth only
ib only
LLE - which actually means that Eth as well as IB will work (IB over
Eth, though).
 
This is also the reason that I have left the code in mlx4_register_mac
commented out.
 
Thanks
Tzachi


________________________________

	From: James Yang [mailto:jyang at xsigo.com] 
	Sent: Tuesday, January 19, 2010 8:37 PM
	To: Tzachi Dar; ofw at lists.openfabrics.org
	Subject: RE: [ofw] patch 1/2 Add support for RDMAoEth to the low
level driver
	
	
	Need the following patch so that it will also work in IB mode.
The HCA PDO was created twice with the old patch, which caused driver
start-up to fail.
	 
	By the way, it seems IPoIB didn't work well with the patch, at
least on Win2008. Could this be an interface mismatch?
	 
	Thanks,
	James
	 
	Index: drv.c
	
===================================================================
	--- drv.c (revision 2617)
	+++ drv.c (working copy)
	@@ -95,7 +95,6 @@
	 
	 #endif
	 
	-static 
	 NTSTATUS
	 __create_child(
	  __in WDFDEVICE  Device,
	@@ -228,7 +227,7 @@
	 
	  if ( p_fdo->children_created )
	   goto end;
	- 
	+
	  // eventually we'll have all information about children in
Registry
	  // DriverEntry will read it into a Global storage and
	  // this routine will create all the children on base on this
info
	@@ -244,6 +243,16 @@
	                      break;
	                 }
	                 eth_created = TRUE;
	+    
	+    //For now we it's either IB or ETH, and we always create
LLE if it's ETH
	+    if((number_of_ib_ports > 0) && (mdev->caps.port_type[1] ==
MLX4_PORT_TYPE_IB) ) {
	+     status = __create_child(Device, BUS_HARDWARE_IDS,
BUS_HARDWARE_DESCRIPTION, 0 );
	+     if (!NT_SUCCESS(status)) {
	+       MLX4_PRINT_EV(TRACE_LEVEL_ERROR, MLX4_DBG_DRV,
("__create_child (ib)failed with 0x%x\n", status));
	+       break;
	+     }
	+     ib_created = TRUE;
	+    }
	             } else {
	                 if (eth_created){
	                     //
	@@ -869,6 +878,9 @@
	   goto err;
	  }
	 
	+ pdev->p_wdf_device = Device;
	+ pdev->ib_hca_created = 0;
	+
	  // start the card
	  status = __start_card(Device, p_fdo );
	  if( !NT_SUCCESS( status ) ) 
	

________________________________

	From: ofw-bounces at lists.openfabrics.org
[mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Tzachi Dar
	Sent: Wednesday, December 02, 2009 8:22 AM
	To: ofw at lists.openfabrics.org
	Subject: [ofw] patch 1/2 Add support for RDMAoEth to the low
level driver
	
	
	Here are the changes in the low level driver:
	 
	Thanks
	Tzachi
	 
	Index: hw/mlx4/kernel/bus/core/ud_header.c
	
===================================================================
	--- hw/mlx4/kernel/bus/core/ud_header.c (revision 2617)
	+++ hw/mlx4/kernel/bus/core/ud_header.c (working copy)
	@@ -62,6 +62,15 @@
	  { STRUCT_FIELD_INIT(lrh, source_lid, 1, 16, 16) }
	 };
	 
	+static const struct ib_field eth_table[]  = {
	+ { STRUCT_FIELD_INIT(eth, dmac_h, 0, 0, 32) },
	+ { STRUCT_FIELD_INIT(eth, dmac_l, 1, 0, 16) },
	+ { STRUCT_FIELD_INIT(eth, smac_h, 1, 16,16) },
	+ { STRUCT_FIELD_INIT(eth, smac_l, 2, 0 ,32) },
	+ { STRUCT_FIELD_INIT(eth, type, 3, 0, 16)}
	+};
	+
	+
	 static const struct ib_field grh_table[]  = {
	  { STRUCT_FIELD_INIT(grh, ip_version, 0, 0, 4) },
	  { STRUCT_FIELD_INIT(grh, traffic_class, 0, 4, 8) },
	@@ -279,3 +288,93 @@
	  return 0;
	 }
	 EXPORT_SYMBOL(ib_ud_header_unpack);
	+
	+/**
	+ * ib_rdmaoe_ud_header_init - Initialize UD header structure
	+ * @payload_bytes:Length of packet payload
	+ * @grh_present:GRH flag (if non-zero, GRH will be included)
	+ * @header:Structure to initialize
	+ *
	+ * ib_rdmaoe_ud_header_init() initializes the grh.ip_version,
grh.payload_length,
	+ * grh.next_header, bth.opcode, bth.pad_count and
	+ * bth.transport_header_version fields of a &struct
eth_ud_header given
	+ * the payload length and whether a GRH will be included.
	+ */
	+void ib_rdmaoe_ud_header_init(int           payload_bytes,
	+      int          grh_present,
	+      struct eth_ud_header    *header)
	+{
	+ int header_len;
	+
	+ memset(header, 0, sizeof *header);
	+
	+ header_len =
	+  sizeof header->eth  +
	+  IB_BTH_BYTES  +
	+  IB_DETH_BYTES;
	+ if (grh_present)
	+  header_len += IB_GRH_BYTES;
	+
	+ header->grh_present          = grh_present;
	+ if (grh_present) {
	+  header->grh.ip_version      = 6;
	+  header->grh.payload_length  =
	+   cpu_to_be16((IB_BTH_BYTES     +
	+         IB_DETH_BYTES    +
	+         payload_bytes    +
	+         4                + /* ICRC     */
	+         3) & ~3);          /* round up */
	+  header->grh.next_header     = 0x1b;
	+ }
	+
	+ if (header->immediate_present)
	+  header->bth.opcode           =
IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
	+ else
	+  header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY;
	+ header->bth.pad_count                =(u8) ((4 -
payload_bytes) & 3);
	+ header->bth.transport_header_version = 0;
	+}
	+
	+
	+
	+/**
	+ * rdmaoe_ud_header_pack - Pack UD header struct into eth wire
format
	+ * @header:UD header struct
	+ * @buf:Buffer to pack into
	+ *
	+ * ib_ud_header_pack() packs the UD header structure @header
into wire
	+ * format in the buffer @buf.
	+ */
	+int rdmaoe_ud_header_pack(struct eth_ud_header *header,
	+         void                 *buf)
	+{
	+ int len = 0;
	+
	+ ib_pack(eth_table, ARRAY_SIZE(eth_table),
	+  &header->eth, buf);
	+ len += IB_ETH_BYTES;
	+
	+ if (header->grh_present) {
	+  ib_pack(grh_table, ARRAY_SIZE(grh_table),
	+   &header->grh, (u8*)buf + len);
	+  len += IB_GRH_BYTES;
	+ }
	+
	+ ib_pack(bth_table, ARRAY_SIZE(bth_table),
	+  &header->bth, (u8*)buf + len);
	+ len += IB_BTH_BYTES;
	+
	+ ib_pack(deth_table, ARRAY_SIZE(deth_table),
	+  &header->deth, (u8*)buf + len);
	+ len += IB_DETH_BYTES;
	+
	+ if (header->immediate_present) {
	+  memcpy((u8*)buf + len, &header->immediate_data,
	+         sizeof header->immediate_data);
	+  len += sizeof header->immediate_data;
	+ }
	+
	+ return len;
	+}
	+
	+
	Index: hw/mlx4/kernel/bus/core/verbs.c
	
===================================================================
	--- hw/mlx4/kernel/bus/core/verbs.c (revision 2617)
	+++ hw/mlx4/kernel/bus/core/verbs.c (working copy)
	@@ -336,3 +336,28 @@
	 }
	 EXPORT_SYMBOL(ib_destroy_ah);
	 
	+enum rdma_transport_type
	+rdma_node_get_transport(enum rdma_node_type node_type)
	+{
	+ switch (node_type) {
	+ case RDMA_NODE_IB_CA:
	+ case RDMA_NODE_IB_SWITCH:
	+ case RDMA_NODE_IB_ROUTER:
	+  return RDMA_TRANSPORT_IB;
	+ case RDMA_NODE_RNIC:
	+  return RDMA_TRANSPORT_IWARP;
	+ default:
	+  ASSERT(FALSE);
	+  return 0;
	+ }
	+}
	+
	+enum rdma_transport_type rdma_port_get_transport(struct
ib_device *device,
	+       u8 port_num)
	+{
	+ return device->get_port_transport ?
	+  device->get_port_transport(device, port_num) :
	+  rdma_node_get_transport(device->node_type);
	+}
	+EXPORT_SYMBOL(rdma_port_get_transport);
	+
	Index: hw/mlx4/kernel/bus/drv/drv.c
	
===================================================================
	--- hw/mlx4/kernel/bus/drv/drv.c (revision 2617)
	+++ hw/mlx4/kernel/bus/drv/drv.c (working copy)
	@@ -95,7 +95,6 @@
	 
	 #endif
	 
	-static 
	 NTSTATUS
	 __create_child(
	  __in WDFDEVICE  Device,
	@@ -228,13 +227,21 @@
	 
	  if ( p_fdo->children_created )
	   goto end;
	- 
	+
	  // eventually we'll have all information about children in
Registry
	  // DriverEntry will read it into a Global storage and
	  // this routine will create all the children on base on this
info
	  number_of_ib_ports = mlx4_count_ib_ports(mdev);
	  ASSERT(number_of_ib_ports >=0 && number_of_ib_ports <=2);
	 
	+ //For now we it's either IB or ETH, and we always create LLE
if it's ETH
	+ if((number_of_ib_ports > 0) && (mdev->caps.port_type[1] ==
MLX4_PORT_TYPE_IB) ) {
	+  status = __create_child(Device, BUS_HARDWARE_IDS,
BUS_HARDWARE_DESCRIPTION, 0 );
	+  if (!NT_SUCCESS(status)) {
	+    MLX4_PRINT_EV(TRACE_LEVEL_ERROR, MLX4_DBG_DRV,
("__create_child (ib)failed with 0x%x\n", status));
	+  }
	+ }
	+
	  for (i = 1; i <= mdev->caps.num_ports; i++) {
	         if (mlx4_is_enabled_port(mdev, i)) {
	             if(mlx4_is_eth_port(mdev, i)) {
	@@ -869,6 +876,9 @@
	   goto err;
	  }
	 
	+ pdev->p_wdf_device = Device;
	+ pdev->ib_hca_created = 0;
	+
	  // start the card
	  status = __start_card(Device, p_fdo );
	  if( !NT_SUCCESS( status ) ) 
	Index: hw/mlx4/kernel/bus/drv/stat.c
	
===================================================================
	--- hw/mlx4/kernel/bus/drv/stat.c (revision 2617)
	+++ hw/mlx4/kernel/bus/drv/stat.c (working copy)
	@@ -113,7 +113,7 @@
	 void st_print_mlx_header( struct mlx4_dev *mdev, struct
mlx4_ib_sqp *sqp, struct mlx4_wqe_mlx_seg *mlx )
	 {
	  if ( mdev->pdev->p_stat_dev->flags & MLX4_MAD_TRACE_UDH )
	- __print_ud_header( mdev, &sqp->ud_header );
	+  __print_ud_header( mdev, &sqp->hdr.ib );
	  if ( mdev->pdev->p_stat_dev->flags & MLX4_MAD_TRACE_WQE )
	   __print_mlx( mdev, mlx );
	 }
	Index: hw/mlx4/kernel/bus/ib/ah.c
	
===================================================================
	--- hw/mlx4/kernel/bus/ib/ah.c (revision 2617)
	+++ hw/mlx4/kernel/bus/ib/ah.c (working copy)
	@@ -32,68 +32,199 @@
	 
	 #include "mlx4_ib.h"
	 
	-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct
ib_ah_attr *ah_attr)
	+static inline int rdma_link_local_addr(struct in6_addr *addr)
	 {
	+ if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
	+     addr->s6_addr32[1] == 0)
	+  return 1;
	+ else
	+  return 0;
	+}
	+
	+inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
	+{
	+ memcpy(mac, &addr->s6_addr[8], 3);
	+ memcpy(mac + 3, &addr->s6_addr[13], 3);
	+ mac[0] ^= 2;   
	+}
	+
	+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
	+{
	+ return addr->s6_addr[0] == 0xff ? 1 : 0;
	+}
	+
	+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8
*mac)
	+{
	+ int i;
	+
	+ mac[0] = 0x33;
	+ mac[1] = 0x33;
	+ for (i = 2; i < 6; ++i)
	+  mac[i] = addr->s6_addr[i + 10];
	+
	+}
	+
	+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct
ib_ah_attr *ah_attr,
	+   u8 *mac, int *is_mcast)
	+{
	+ int err = 0;
	+ struct sockaddr_in6 dst;
	+
	+ UNREFERENCED_PARAMETER(dev);
	+
	+ *is_mcast = 0;
	+ memcpy(dst.sin6_addr.s6_addr, ah_attr->grh.dgid.raw,
sizeof(ah_attr->grh.dgid.raw));
	+
	+ if (rdma_link_local_addr(&dst.sin6_addr))
	+  rdma_get_ll_mac(&dst.sin6_addr, mac);
	+ else if (rdma_is_multicast_addr(&dst.sin6_addr)) {
	+  rdma_get_mcast_mac(&dst.sin6_addr, mac);
	+  *is_mcast = 1;
	+ } else {
	+  err = -EINVAL; //jyang:todo
	+  ASSERT(FALSE);
	+ }
	+ return err;
	+}
	+
	+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct
ib_ah_attr *ah_attr,
	+      struct mlx4_ib_ah *ah)
	+{
	  struct mlx4_dev *dev = to_mdev(pd->device)->dev;
	- struct mlx4_ib_ah *ah;
	 
	  if (mlx4_is_barred(pd->device->dma_device))
	   return ERR_PTR(-EFAULT);
	 
	- ah = kmalloc(sizeof *ah, GFP_ATOMIC);
	- if (!ah)
	-  return ERR_PTR(-ENOMEM);
	 
	- memset(&ah->av, 0, sizeof ah->av);
	-
	- ah->av.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
(ah_attr->port_num << 24));
	- ah->av.g_slid  = ah_attr->src_path_bits;
	- ah->av.dlid    = cpu_to_be16(ah_attr->dlid);
	- if (ah_attr->static_rate) {
	-  ah->av.stat_rate = ah_attr->static_rate +
MLX4_STAT_RATE_OFFSET;
	-  while (ah->av.stat_rate > IB_RATE_2_5_GBPS +
MLX4_STAT_RATE_OFFSET &&
	-         !(1 << ah->av.stat_rate &
dev->caps.stat_rate_support))
	-   --ah->av.stat_rate;
	- }
	- ah->av.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
	+ ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
(ah_attr->port_num << 24));
	+ ah->av.ib.g_slid  = ah_attr->src_path_bits;
	  if (ah_attr->ah_flags & IB_AH_GRH) {
	-  ah->av.g_slid   |= 0x80;
	-  ah->av.gid_index = ah_attr->grh.sgid_index;
	-  ah->av.hop_limit = ah_attr->grh.hop_limit;
	-  ah->av.sl_tclass_flowlabel |=
	+  ah->av.ib.g_slid   |= 0x80;
	+  ah->av.ib.gid_index = ah_attr->grh.sgid_index;
	+  ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
	+  ah->av.ib.sl_tclass_flowlabel |=
	    cpu_to_be32((ah_attr->grh.traffic_class << 20) |
	         ah_attr->grh.flow_label);
	-  memcpy(ah->av.dgid, ah_attr->grh.dgid.raw, 16);
	+  memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
	  }
	 
	+ ah->av.ib.dlid    = cpu_to_be16(ah_attr->dlid);
	+ if (ah_attr->static_rate) {
	+  ah->av.ib.stat_rate = ah_attr->static_rate +
MLX4_STAT_RATE_OFFSET;
	+  while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS +
MLX4_STAT_RATE_OFFSET &&
	+         !(1 << ah->av.ib.stat_rate &
dev->caps.stat_rate_support))
	+   --ah->av.ib.stat_rate;
	+ }
	+ ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl <<
28);
	+
	  return &ah->ibah;
	 }
	 
	+struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct
ib_ah_attr *ah_attr,
	+       struct mlx4_ib_ah *ah)
	+{
	+ struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
	+ struct mlx4_dev *dev = ibdev->dev;
	+ u8 mac[6];
	+ int err;
	+ int is_mcast;
	+
	+ if (mlx4_is_barred(pd->device->dma_device))
	+  return ERR_PTR(-EFAULT);
	+
	+ err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast);
	+ if (err)
	+  return ERR_PTR(err);
	+
	+ memcpy(ah->av.eth.mac_0_1, mac, 2);
	+ memcpy(ah->av.eth.mac_2_5, mac + 2, 4);
	+ ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
(ah_attr->port_num << 24));
	+ ah->av.ib.g_slid = 0x80;
	+ if (ah_attr->static_rate) {
	+  ah->av.ib.stat_rate = ah_attr->static_rate +
MLX4_STAT_RATE_OFFSET;
	+  while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS +
MLX4_STAT_RATE_OFFSET &&
	+         !(1 << ah->av.ib.stat_rate &
dev->caps.stat_rate_support))
	+   --ah->av.ib.stat_rate;
	+ }
	+
	+ /*
	+  * HW requires multicast LID so we just choose one.
	+  */
	+ if (is_mcast)
	+  ah->av.ib.dlid = cpu_to_be16(0xc000);
	+
	+ memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
	+ ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl <<
28);
	+
	+ return &ah->ibah;
	+}
	+
	+
	+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct
ib_ah_attr *ah_attr)
	+{
	+ struct mlx4_ib_ah *ah;
	+ enum rdma_transport_type transport;
	+
	+ struct ib_ah *ret;
	+
	+ ah = kzalloc(sizeof *ah, GFP_ATOMIC);
	+ if (!ah)
	+  return ERR_PTR(-ENOMEM);
	+
	+ transport = rdma_port_get_transport(pd->device,
ah_attr->port_num);
	+ if (transport == RDMA_TRANSPORT_RDMAOE) {
	+  if (!(ah_attr->ah_flags & IB_AH_GRH)) {
	+   ret = ERR_PTR(-EINVAL);
	+   goto out;
	+  } else {
	+   /* TBD: need to handle the case when we get called
	+   in an atomic context and there we might sleep. We
	+   don't expect this currently since we're working with
	+   link local addresses which we can translate without
	+   going to sleep */
	+   ret = create_rdmaoe_ah(pd, ah_attr, ah);
	+   if (IS_ERR(ret))
	+    goto out;
	+   else
	+    return ret;
	+  }
	+ } else
	+  return create_ib_ah(pd, ah_attr, ah); /* never fails */
	+
	+out:
	+ kfree(ah);
	+ return ret;
	+}
	+
	+
	 int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr
*ah_attr)
	 {
	  struct mlx4_ib_ah *ah = to_mah(ibah);
	+ enum rdma_transport_type transport;
	 
	+ transport = rdma_port_get_transport(ibah->device,
ah_attr->port_num);
	+
	  if (mlx4_is_barred(ibah->device->dma_device))
	   return -EFAULT;
	 
	  memset(ah_attr, 0, sizeof *ah_attr);
	- ah_attr->dlid        = be16_to_cpu(ah->av.dlid);
	- ah_attr->sl        =
(u8)(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28);
	- ah_attr->port_num      = (u8)(be32_to_cpu(ah->av.port_pd) >>
24);
	- if (ah->av.stat_rate)
	-  ah_attr->static_rate = ah->av.stat_rate -
MLX4_STAT_RATE_OFFSET;
	- ah_attr->src_path_bits = ah->av.g_slid & 0x7F;
	+ ah_attr->dlid        = transport == RDMA_TRANSPORT_IB ?
be16_to_cpu(ah->av.ib.dlid) : 0;
	+ ah_attr->sl        =
(u8)(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28);
	+ ah_attr->port_num      = (u8)(be32_to_cpu(ah->av.ib.port_pd)
>> 24);
	+ if (ah->av.ib.stat_rate)
	+  ah_attr->static_rate = ah->av.ib.stat_rate -
MLX4_STAT_RATE_OFFSET;
	+ ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
	 
	  if (mlx4_ib_ah_grh_present(ah)) {
	   ah_attr->ah_flags = IB_AH_GRH;
	 
	   ah_attr->grh.traffic_class =
	-   (u8)(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20);
	+   (u8)(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20);
	   ah_attr->grh.flow_label =
	-   be32_to_cpu(ah->av.sl_tclass_flowlabel) & 0xfffff;
	-  ah_attr->grh.hop_limit  = ah->av.hop_limit;
	-  ah_attr->grh.sgid_index = ah->av.gid_index;
	-  memcpy(ah_attr->grh.dgid.raw, ah->av.dgid, 16);
	+   be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff;
	+  ah_attr->grh.hop_limit  = ah->av.ib.hop_limit;
	+  ah_attr->grh.sgid_index = ah->av.ib.gid_index;
	+  memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
	  }
	 
	  return 0;
	@@ -108,7 +239,7 @@
	 // Leo: temporary 
	 int mlx4_ib_modify_ah( struct ib_ah *ibah, struct ib_ah_attr
*ah_attr )
	 {
	- struct mlx4_av *av  = &to_mah(ibah)->av;
	+ struct mlx4_av *av  = &to_mah(ibah)->av.ib;
	  struct mlx4_dev *dev = to_mdev(ibah->pd->device)->dev;
	 
	  if (mlx4_is_barred(dev))
	Index: hw/mlx4/kernel/bus/ib/main.c
	
===================================================================
	--- hw/mlx4/kernel/bus/ib/main.c (revision 2617)
	+++ hw/mlx4/kernel/bus/ib/main.c (working copy)
	@@ -133,31 +133,21 @@
	  return err;
	 }
	 
	-static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
	-         struct ib_port_attr *props)
	+
	+static enum rdma_transport_type
	+mlx4_ib_port_get_transport(struct ib_device *device, u8
port_num)
	 {
	- struct ib_smp *in_mad  = NULL;
	- struct ib_smp *out_mad = NULL;
	- int err = -ENOMEM;
	+ struct mlx4_dev *dev = to_mdev(device)->dev;
	 
	- if (mlx4_is_barred(ibdev->dma_device))
	-  return -EFAULT;
	- 
	- in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
	- out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
	- if (!in_mad || !out_mad)
	-  goto out;
	+ return dev->caps.port_mask & (1 << (port_num - 1)) ?
	+  RDMA_TRANSPORT_IB : RDMA_TRANSPORT_RDMAOE;
	+}
	 
	- memset(props, 0, sizeof *props);
	 
	- init_query_mad(in_mad);
	- in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
	- in_mad->attr_mod = cpu_to_be32(port);
	-
	- err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL,
in_mad, out_mad);
	- if (err)
	-  goto out;
	-
	+static void ib_link_query_port(struct ib_device *ibdev, u8
port,
	+          struct ib_port_attr *props,
	+          struct ib_smp *out_mad)
	+{
	  props->lid  = be16_to_cpup((__be16 *) (out_mad->data + 16));
	  props->lmc  = out_mad->data[34] & 0x7;
	  props->sm_lid  = be16_to_cpup((__be16 *) (out_mad->data +
18));
	@@ -177,7 +167,64 @@
	  props->subnet_timeout = out_mad->data[51] & 0x1f;
	  props->max_vl_num = out_mad->data[37] >> 4;
	  props->init_type_reply = out_mad->data[41] >> 4;
	+ props->transport= RDMA_TRANSPORT_IB;
	+}
	 
	+static void eth_link_query_port(struct ib_device *ibdev, u8
port,
	+    struct ib_port_attr *props,
	+    struct ib_smp *out_mad)
	+{
	+
	+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data
+ 20));
	+ props->gid_tbl_len =
to_mdev(ibdev)->dev->caps.gid_table_len[port];
	+ props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz;
	+ props->pkey_tbl_len =
(u16)to_mdev(ibdev)->dev->caps.pkey_table_len[port];
	+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data
+ 46));
	+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data
+ 48));
	+ props->active_width = out_mad->data[31] & 0xf;
	+ props->active_speed = out_mad->data[35] >> 4;
	+ props->max_mtu  = out_mad->data[41] & 0xf;
	+ //props->active_mtu = rdmaoe->mtu[port - 1];
	+ props->active_mtu = 1500; //jyang:hardcoded
	+ props->subnet_timeout = out_mad->data[51] & 0x1f;
	+ props->max_vl_num = out_mad->data[37] >> 4;
	+ props->init_type_reply = out_mad->data[41] >> 4;
	+ props->transport= RDMA_TRANSPORT_RDMAOE;
	+
	+ //props->state  = netif_running(ndev) &&  netif_oper_up(ndev)
?
	+ //    IB_PORT_ACTIVE : IB_PORT_DOWN;
	+ props->state  = IB_PORT_ACTIVE; //jyang: just hardcoded it now
	+ props->phys_state = props->state;
	+}
	+
	+
	+
	+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
	+             struct ib_port_attr *props)
	+{
	+ struct ib_smp *in_mad  = NULL;
	+ struct ib_smp *out_mad = NULL;
	+ int err = -ENOMEM;
	+
	+ in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
	+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
	+ if (!in_mad || !out_mad)
	+  goto out;
	+
	+ memset(props, 0, sizeof *props);
	+
	+ init_query_mad(in_mad);
	+ in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
	+ in_mad->attr_mod = cpu_to_be32(port);
	+
	+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL,
in_mad, out_mad);
	+ if (err)
	+  goto out;
	+
	+ mlx4_ib_port_get_transport(ibdev, port) == RDMA_TRANSPORT_IB ?
	+  ib_link_query_port(ibdev, port, props, out_mad) :
	+  eth_link_query_port(ibdev, port, props, out_mad);
	+
	 out:
	  kfree(in_mad);
	  kfree(out_mad);
	@@ -522,6 +569,7 @@
	  ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
	  ibdev->ib_dev.query_device = mlx4_ib_query_device;
	  ibdev->ib_dev.query_port = mlx4_ib_query_port;
	+ ibdev->ib_dev.get_port_transport = mlx4_ib_port_get_transport;
	  ibdev->ib_dev.query_gid_chunk = mlx4_ib_query_gid_chunk;
	  ibdev->ib_dev.query_pkey_chunk = mlx4_ib_query_pkey_chunk;
	  ibdev->ib_dev.modify_device = mlx4_ib_modify_device;
	Index: hw/mlx4/kernel/bus/ib/mlx4_ib.h
	
===================================================================
	--- hw/mlx4/kernel/bus/ib/mlx4_ib.h (revision 2617)
	+++ hw/mlx4/kernel/bus/ib/mlx4_ib.h (working copy)
	@@ -165,14 +165,15 @@
	 
	 struct mlx4_ib_ah {
	  struct ib_ah  ibah;
	- struct mlx4_av  av;
	+ union mlx4_ext_av   av;
	 };
	 
	+
	 enum {
	  /*
	   * Largest possible UD header: send with GRH and immediate
data.
	   */
	- MLX4_IB_UD_HEADER_SIZE  = 72
	+ MLX4_IB_UD_HEADER_SIZE  = 76
	 };
	 
	 struct mlx4_ib_sqp {
	@@ -180,7 +181,10 @@
	  int   pkey_index;
	  u32   qkey;
	  u32   send_psn;
	- struct ib_ud_header ud_header;
	+ union {
	+  struct ib_ud_header ib;
	+  struct eth_ud_header eth;
	+ } hdr;
	  u8   header_buf[MLX4_IB_UD_HEADER_SIZE];
	 };
	 
	@@ -340,9 +344,14 @@
	 int __init mlx4_ib_init(void);
	 void __exit mlx4_ib_cleanup(void);
	 
	+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct
ib_ah_attr *ah_attr,
	+   u8 *mac, int *is_mcast);
	+
	+
	 static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
	 {
	- return !!(ah->av.g_slid & 0x80);
	+ return !!(ah->av.ib.g_slid & 0x80);
	+
	 }
	 
	 #endif /* MLX4_IB_H */
	Index: hw/mlx4/kernel/bus/ib/qp.c
	
===================================================================
	--- hw/mlx4/kernel/bus/ib/qp.c (revision 2617)
	+++ hw/mlx4/kernel/bus/ib/qp.c (working copy)
	@@ -46,10 +46,16 @@
	 
	 enum {
	  MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
	- MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f
	+ MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
	+ MLX4_IB_LINK_TYPE_IB  = 0,
	+ MLX4_IB_LINK_TYPE_ETH  = 1
	 };
	 
	 enum {
	+ MLX4_RDMAOE_ETHERTYPE = 0x8915
	+};
	+
	+enum {
	  MLX4_IB_MIN_SQ_STRIDE = 6
	 };
	 
	@@ -65,6 +71,8 @@
	  __constant_cpu_to_be32(MLX4_OPCODE_NOP)    /* [IB_WR_NOP]
*/
	 };
	 
	+extern inline void rdma_get_ll_mac(struct in6_addr *addr, u8
*mac);
	+
	 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
	 {
	  return container_of(mqp, struct mlx4_ib_sqp, qp);
	@@ -724,6 +732,12 @@
	 static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct
ib_ah_attr *ah,
	     struct mlx4_qp_path *path, u8 port)
	 {
	+ int err;
	+ int is_eth = rdma_port_get_transport(&dev->ib_dev, port) ==
	+  RDMA_TRANSPORT_RDMAOE ? 1 : 0;
	+ u8 mac[6];
	+ int is_mcast;
	+
	  path->grh_mylmc     = ah->src_path_bits & 0x7f;
	  path->rlid     = cpu_to_be16(ah->dlid);
	  if (ah->static_rate) {
	@@ -754,7 +768,21 @@
	  path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
	   ((port - 1) << 6) | ((ah->sl & 0xf) << 2);
	 
	- return 0;
	+ if (is_eth) {
	+  if (!(ah->ah_flags & IB_AH_GRH))
	+   return -1;
	+
	+  err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast);
	+  if (err)
	+   return err;
	+
	+  memcpy(path->dmac, mac, 6);
	+  path->ackto = MLX4_IB_LINK_TYPE_ETH;
	+  /* use index 0 into MAC table for RDMAoE */
	+  path->grh_mylmc &= 0x80;
	+ }
	+
	+    return 0;
	 }
	 
	 static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
	@@ -1146,79 +1174,132 @@
	  return opcode;
	 }
	 
	+
	+
	+
	 static int build_mlx_header(struct mlx4_ib_sqp *sqp,
ib_send_wr_t *wr,
	-       void *wqe)
	+       void *wqe, unsigned *mlx_seg_len)
	 {
	  enum ib_wr_opcode opcode = to_wr_opcode(wr);
	  struct ib_device *ib_dev =
&to_mdev(sqp->qp.ibqp.device)->ib_dev;
	  struct mlx4_wqe_mlx_seg *mlx = wqe;
	  struct mlx4_wqe_inline_seg *inl = (void*)((u8*)wqe + sizeof
*mlx);
	  struct mlx4_ib_ah *ah = to_mah((struct ib_ah
*)wr->dgrm.ud.h_av);
	- __be16 pkey;
	+ u16 pkey;
	  int send_size;
	  int header_size;
	  int spc;
	- u32 i;
	+ u16 i;
	+ struct ib_ud_header *ib = NULL;
	+ struct eth_ud_header *eth = NULL;
	+ struct ib_unpacked_grh *grh;
	+ struct ib_unpacked_bth  *bth;
	+ struct ib_unpacked_deth *deth;
	+ u8 *tmp;
	+ u8 mac[6];
	 
	  send_size = 0;
	  for (i = 0; i < wr->num_ds; ++i)
	   send_size += wr->ds_array[i].length;
	 
	- ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah),
&sqp->ud_header);
	+ if (rdma_port_get_transport(sqp->qp.ibqp.device, sqp->qp.port)
== RDMA_TRANSPORT_IB) {
	 
	- sqp->ud_header.lrh.service_level   =
	-  (u8)(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28);
	- sqp->ud_header.lrh.destination_lid = ah->av.dlid;
	- sqp->ud_header.lrh.source_lid      = cpu_to_be16(ah->av.g_slid
& 0x7f);
	+  ib = &sqp->hdr.ib;
	+  grh = &ib->grh;
	+  bth = &ib->bth;
	+  deth = &ib->deth;
	+  ib_ud_header_init(send_size, mlx4_ib_ah_grh_present(ah), ib);
	+  ib->lrh.service_level   =
	+   (u8)(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28);
	+  ib->lrh.destination_lid = ah->av.ib.dlid;
	+  ib->lrh.source_lid      = cpu_to_be16(ah->av.ib.g_slid &
0x7f);
	+ } else {
	+  eth = &sqp->hdr.eth;
	+  grh = &eth->grh;
	+  bth = &eth->bth;
	+  deth = &eth->deth;
	+  ib_rdmaoe_ud_header_init(send_size,
mlx4_ib_ah_grh_present(ah), eth);
	+ }
	+
	+ 
	  if (mlx4_ib_ah_grh_present(ah)) {
	-  sqp->ud_header.grh.traffic_class =
	-   (u8)((be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) &
0xff);
	-  sqp->ud_header.grh.flow_label    =
	-   ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
	-  sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
	-  ib_get_cached_gid(ib_dev, (u8)(be32_to_cpu(ah->av.port_pd) >>
24),
	-      ah->av.gid_index, &sqp->ud_header.grh.source_gid);
	-  memcpy(sqp->ud_header.grh.destination_gid.raw,
	-         ah->av.dgid, 16);
	+  grh->traffic_class =
	+   (u8)((be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) &
0xff);
	+  grh->flow_label    =
	+   ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
	+  grh->hop_limit     = ah->av.ib.hop_limit;
	+  ib_get_cached_gid(ib_dev, (u8)(be32_to_cpu(ah->av.ib.port_pd)
>> 24),
	+      ah->av.ib.gid_index, &grh->source_gid);
	+  memcpy(grh->destination_gid.raw,
	+      ah->av.ib.dgid, 16);
	  }
	 
	  mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
	- mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ?
MLX4_WQE_MLX_VL15 : 0) |
	-      (sqp->ud_header.lrh.destination_lid ==
	-       XIB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
	-      (sqp->ud_header.lrh.service_level << 8));
	- mlx->rlid   = sqp->ud_header.lrh.destination_lid;
	 
	+ if (ib) {
	+  mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ?
MLX4_WQE_MLX_VL15 : 0) |
	+       (ib->lrh.destination_lid ==
	+        IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
	+       (ib->lrh.service_level << 8));
	+  mlx->rlid   = ib->lrh.destination_lid;
	+
	+ }
	+
	  switch (opcode) {
	  case IB_WR_SEND:
	-  sqp->ud_header.bth.opcode  = IB_OPCODE_UD_SEND_ONLY;
	-  sqp->ud_header.immediate_present = 0;
	+  bth->opcode  = IB_OPCODE_UD_SEND_ONLY;
	+  if (ib)
	+   ib->immediate_present = 0;
	+  else
	+   eth->immediate_present = 0;
	   break;
	  case IB_WR_SEND_WITH_IMM:
	-  sqp->ud_header.bth.opcode  =
IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
	-  sqp->ud_header.immediate_present = 1;
	-  sqp->ud_header.immediate_data    = wr->immediate_data;
	+  bth->opcode  = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
	+  if (ib) {
	+   ib->immediate_present = 1;
	+   ib->immediate_data    = wr->immediate_data;
	+  } else {
	+   eth->immediate_present = 1;
	+   eth->immediate_data    = wr->immediate_data;
	+  }
	   break;
	  default:
	   return -EINVAL;
	  }
	 
	- sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15
: 0;
	- if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
	-  sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
	- sqp->ud_header.bth.solicited_event = (u8)(!!(wr->send_opt &
IB_SEND_OPT_SOLICITED));
	+ if (ib) {
	+  ib->lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
	+  if (ib->lrh.destination_lid == IB_LID_PERMISSIVE)
	+   ib->lrh.source_lid = IB_LID_PERMISSIVE;
	+ } else {
	+  memcpy(eth->eth.dmac_h, ah->av.eth.mac_0_1, 2);
	+  memcpy(eth->eth.dmac_h + 2, ah->av.eth.mac_2_5, 2);
	+  memcpy(eth->eth.dmac_l, ah->av.eth.mac_2_5 + 2, 2);
	+  rdma_get_ll_mac((struct in6_addr *)&grh->source_gid, mac);
	+
	+  tmp = mac;
	+  memcpy(eth->eth.smac_h, tmp, 2);
	+  memcpy(eth->eth.smac_l, tmp + 2, 4);
	+  eth->eth.type = cpu_to_be16(MLX4_RDMAOE_ETHERTYPE);
	+ }
	+
	+ bth->solicited_event = (u8)(!!(wr->send_opt &
IB_SEND_SOLICITED));
	+
	  if (!sqp->qp.ibqp.qp_num)
	   ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index,
&pkey);
	  else
	   ib_get_cached_pkey(ib_dev, sqp->qp.port,
wr->dgrm.ud.pkey_index, &pkey);
	- sqp->ud_header.bth.pkey = pkey;
	- sqp->ud_header.bth.destination_qpn = wr->dgrm.ud.remote_qp;
	- sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1
<< 24) - 1));
	- sqp->ud_header.deth.qkey = wr->dgrm.ud.remote_qkey &
0x00000080 ?
	-  cpu_to_be32(sqp->qkey) : wr->dgrm.ud.remote_qkey;
	- sqp->ud_header.deth.source_qpn =
cpu_to_be32(sqp->qp.ibqp.qp_num);
	+ bth->pkey = pkey;
	+ bth->destination_qpn = wr->dgrm.ud.remote_qp;
	+ bth->psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
	+ deth->qkey = wr->dgrm.ud.remote_qkey & 0x80000000 ?
	+         cpu_to_be32(sqp->qkey) : wr->dgrm.ud.remote_qkey;
	+ deth->source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
	 
	- header_size = ib_ud_header_pack(&sqp->ud_header,
sqp->header_buf);
	+ if (ib)
	+  header_size = ib_ud_header_pack(ib, sqp->header_buf);
	+ else
	+  header_size = rdmaoe_ud_header_pack(eth, sqp->header_buf);
	 
	 #if 0
	  {
	@@ -1271,7 +1352,10 @@
	   i = 2;
	  }
	 
	- return ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) +
header_size, 16);
	+ *mlx_seg_len =
	+  ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size,
16);
	+ return 0;
	+
	 }
	 
	 static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq,
struct ib_cq *ib_cq)
	@@ -1314,9 +1398,13 @@
	 static void set_datagram_seg(struct mlx4_wqe_datagram_seg
*dseg,
	         ib_send_wr_t *wr)
	 {
	+
	  memcpy(dseg->av, &to_mah((struct ib_ah
*)wr->dgrm.ud.h_av)->av, sizeof (struct mlx4_av));
	  dseg->dqpn = wr->dgrm.ud.remote_qp;
	  dseg->qkey = wr->dgrm.ud.remote_qkey;
	+ dseg->vlan = to_mah((struct ib_ah
*)wr->dgrm.ud.h_av)->av.eth.vlan;
	+ memcpy(dseg->mac_0_1, to_mah((struct ib_ah
*)wr->dgrm.ud.h_av)->av.eth.mac_0_1, 6);
	+
	 }
	 
	 static void set_mlx_icrc_seg(void *dseg)
	@@ -1398,7 +1486,7 @@
	 int mlx4_ib_post_send(struct ib_qp *ibqp, ib_send_wr_t *wr,
	         ib_send_wr_t **bad_wr)
	 {
	- enum ib_wr_opcode opcode;
	+ enum ib_wr_opcode opcode;// = to_wr_opcode(wr);
	  struct mlx4_ib_qp *qp = to_mqp(ibqp);
	  struct mlx4_dev *dev = to_mdev(ibqp->device)->dev;
	  u8 *wqe /*, *wqe_start*/;
	@@ -1525,16 +1613,14 @@
	 
	   case IB_QPT_SMI:
	   case IB_QPT_GSI:
	-   err = build_mlx_header(to_msqp(qp), wr, ctrl);
	+   err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
	    if (err < 0) {
	     if (bad_wr)
	      *bad_wr = wr;
	     goto out;
	    }
	-   
	-   wqe  += err;
	-   size += err / 16;
	-
	+   wqe  += seglen;
	+   size += seglen / 16;
	    err = 0;
	    break;
	 
	Index: hw/mlx4/kernel/bus/inc/cmd.h
	
===================================================================
	--- hw/mlx4/kernel/bus/inc/cmd.h (revision 2617)
	+++ hw/mlx4/kernel/bus/inc/cmd.h (working copy)
	@@ -138,6 +138,7 @@
	  MLX4_SET_PORT_MAC_TABLE = 0x2,
	  MLX4_SET_PORT_VLAN_TABLE = 0x3,
	  MLX4_SET_PORT_PRIO_MAP  = 0x4,
	+ MLX4_SET_PORT_GID_TABLE = 0x5,
	 };
	 
	 struct mlx4_dev;
	Index: hw/mlx4/kernel/bus/inc/device.h
	
===================================================================
	--- hw/mlx4/kernel/bus/inc/device.h (revision 2617)
	+++ hw/mlx4/kernel/bus/inc/device.h (working copy)
	@@ -208,8 +208,9 @@
	  int   log_num_prios;
	  int   num_fc_exch;
	  enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
	-    enum mlx4_port_state port_state[MLX4_MAX_PORTS + 1];
	-    int   reserved_fexch_mpts_base;   
	+ u32   port_mask;
	+ enum mlx4_port_state port_state[MLX4_MAX_PORTS + 1];
	+ int   reserved_fexch_mpts_base;   
	  int   total_reserved_qps;
	 };
	 
	@@ -343,6 +344,28 @@
	  u8   dgid[16];
	 };
	 
	+struct mlx4_eth_av {
	+ __be32  port_pd;
	+ u8  reserved1;
	+ u8  smac_idx;
	+ u16  reserved2;
	+ u8  reserved3;
	+ u8  gid_index;
	+ u8  stat_rate;
	+ u8  hop_limit;
	+ __be32  sl_tclass_flowlabel;
	+ u8  dgid[16];
	+ u32  reserved4[2];
	+ __be16  vlan;
	+ u8  mac_0_1[2];
	+ u8  mac_2_5[4];
	+};
	+
	+union mlx4_ext_av {
	+ struct mlx4_av  ib;
	+ struct mlx4_eth_av eth;
	+};
	+
	 #define MLX4_DEV_SIGNATURE 0xf1b34a6e
	 
	 struct mlx4_dev_params {
	Index: hw/mlx4/kernel/bus/inc/ib_pack.h
	
===================================================================
	--- hw/mlx4/kernel/bus/inc/ib_pack.h (revision 2617)
	+++ hw/mlx4/kernel/bus/inc/ib_pack.h (working copy)
	@@ -39,6 +39,7 @@
	 
	 enum {
	  IB_LRH_BYTES  = 8,
	+ IB_ETH_BYTES  = 14,
	  IB_GRH_BYTES  = 40,
	  IB_BTH_BYTES  = 12,
	  IB_DETH_BYTES = 8
	@@ -212,6 +213,15 @@
	  __be32       source_qpn;
	 };
	 
	+struct ib_unpacked_eth {
	+ u8 dmac_h[4];
	+ u8 dmac_l[2];
	+ u8 smac_h[2];
	+ u8 smac_l[4];
	+ __be16 type;
	+};
	+
	+
	 struct ib_ud_header {
	  struct ib_unpacked_lrh  lrh;
	  int                     grh_present;
	@@ -222,6 +232,19 @@
	  __be32           immediate_data;
	 };
	 
	+
	+
	+struct eth_ud_header {
	+ struct ib_unpacked_eth  eth;
	+ int                     grh_present;
	+ struct ib_unpacked_grh  grh;
	+ struct ib_unpacked_bth  bth;
	+ struct ib_unpacked_deth deth;
	+ int              immediate_present;
	+ __be32           immediate_data;
	+};
	+
	+
	 void ib_pack(const struct ib_field        *desc,
	       int                           desc_len,
	       void                         *structure,
	@@ -236,10 +259,18 @@
	          int         grh_present,
	          struct ib_ud_header *header);
	 
	+void ib_rdmaoe_ud_header_init(int          payload_bytes,
	+      int         grh_present,
	+      struct eth_ud_header   *header);
	+
	 int ib_ud_header_pack(struct ib_ud_header *header,
	         void                *buf);
	 
	 int ib_ud_header_unpack(void                *buf,
	    struct ib_ud_header *header);
	 
	+int rdmaoe_ud_header_pack(struct eth_ud_header *header,
	+         void                 *buf);
	+
	+
	 #endif /* IB_PACK_H */
	Index: hw/mlx4/kernel/bus/inc/ib_verbs.h
	
===================================================================
	--- hw/mlx4/kernel/bus/inc/ib_verbs.h (revision 2617)
	+++ hw/mlx4/kernel/bus/inc/ib_verbs.h (working copy)
	@@ -53,6 +53,34 @@
	 
	 #include "ib_verbs_ex.h"
	 
	+/*
	+ * IPv6 address structure
	+ */
	+
	+struct in6_addr
	+{
	+ union 
	+ {
	+  __u8  u6_addr8[16];
	+  __be16  u6_addr16[8];
	+  __be32  u6_addr32[4];
	+ } in6_u;
	+#define s6_addr   in6_u.u6_addr8
	+#define s6_addr16  in6_u.u6_addr16
	+#define s6_addr32  in6_u.u6_addr32
	+};
	+
	+
	+struct sockaddr_in6 {
	+ unsigned short int sin6_family;    /* AF_INET6 */
	+ __be16   sin6_port;      /* Transport layer port # */
	+ __be32   sin6_flowinfo;  /* IPv6 flow information */
	+ struct in6_addr  sin6_addr;      /* IPv6 address */
	+ __u32   sin6_scope_id;  /* scope id (new in RFC2553) */
	+};
	+
	+#define AF_INET6 10 /* IP version 6   */
	+
	 enum rdma_node_type {
	  /* IB values map to NodeInfo:NodeType. */
	  RDMA_NODE_IB_CA  = 1,
	@@ -63,7 +91,8 @@
	 
	 enum rdma_transport_type {
	  RDMA_TRANSPORT_IB,
	- RDMA_TRANSPORT_IWARP
	+ RDMA_TRANSPORT_IWARP,
	+ RDMA_TRANSPORT_RDMAOE
	 };
	 
	 enum rdma_transport_type
	@@ -231,6 +260,7 @@
	  u8   active_width;
	  u8   active_speed;
	  u8                      phys_state;
	+ enum rdma_transport_type transport;
	 };
	 
	 enum ib_device_modify_flags {
	@@ -633,6 +663,10 @@
	  IB_WR_ATOMIC_CMP_AND_SWP,
	  IB_WR_ATOMIC_FETCH_AND_ADD,
	  IB_WR_LSO,
	+ IB_WR_SEND_WITH_INV,
	+ IB_WR_RDMA_READ_WITH_INV,
	+ IB_WR_LOCAL_INV,
	+ IB_WR_FAST_REG_MR,
	  IB_WR_NOP
	 };
	 
	@@ -920,6 +954,9 @@
	  int             (*query_port)(struct ib_device *device,
	        u8 port_num,
	        struct ib_port_attr *port_attr);
	+ enum rdma_transport_type   (*get_port_transport)(struct
ib_device *device,
	+        u8 port_num);
	+
	  int             (*query_gid_chunk)(struct ib_device *device,
	       u8 port_num, int index,
	       union ib_gid gid[8], int size);
	@@ -1127,6 +1164,11 @@
	 int ib_query_port(struct ib_device *device,
	     u8 port_num, struct ib_port_attr *port_attr);
	 
	+enum rdma_transport_type rdma_port_get_transport(struct
ib_device *device,
	+       u8 port_num);
	+int rdma_is_transport_supported(struct ib_device *device,
	+    enum rdma_transport_type transport);
	+
	 int ib_query_gid_chunk(struct ib_device *device,
	    u8 port_num, int index, union ib_gid gid[8], int size);
	 
	Index: hw/mlx4/kernel/bus/inc/qp.h
	
===================================================================
	--- hw/mlx4/kernel/bus/inc/qp.h (revision 2617)
	+++ hw/mlx4/kernel/bus/inc/qp.h (working copy)
	@@ -113,7 +113,9 @@
	  u8   snooper_flags;
	  u8   reserved3[2];
	  u8   counter_index;
	- u8   reserved4[7];
	+ u8   reserved4;
	+ u8   dmac[6];
	+
	 };
	 
	 struct mlx4_qp_context {
	@@ -213,7 +215,9 @@
	  __be32   av[8];
	  __be32   dqpn;
	  __be32   qkey;
	- __be32   reservd[2];
	+ __be16   vlan;
	+ u8   mac_0_1[2];
	+ u8   mac_2_5[4];
	 };
	 
	 #pragma warning( disable : 4200)
	Index: hw/mlx4/kernel/bus/net/main.c
	
===================================================================
	--- hw/mlx4/kernel/bus/net/main.c (revision 2617)
	+++ hw/mlx4/kernel/bus/net/main.c (working copy)
	@@ -139,7 +139,9 @@
	  int count = 0;
	 
	  for (i = 0; i < dev->caps.num_ports; i++) {
	-  if (dev->caps.port_type[i+1] == MLX4_PORT_TYPE_IB) {
	+  if ((dev->caps.port_type[i+1] == MLX4_PORT_TYPE_IB) ||
	+            (dev->caps.port_type[i+1] == MLX4_PORT_TYPE_ETH))
	+        {
	    count++;
	   }
	  }
	@@ -170,6 +172,16 @@
	  return FALSE;
	 }
	 
	+static void mlx4_set_port_mask(struct mlx4_dev *dev)
	+{
	+ int i;
	+
	+ dev->caps.port_mask = 0;
	+ for (i = 1; i <= dev->caps.num_ports; ++i)
	+  if (dev->caps.port_type[i] == MLX4_PORT_TYPE_IB)
	+   dev->caps.port_mask |= 1 << (i - 1);
	+}
	+
	 static int mlx4_dev_cap(struct mlx4_dev *dev, struct
mlx4_dev_cap *dev_cap)
	 {
	  int err;
	@@ -309,6 +321,8 @@
	    ++num_eth_ports;
	  }
	 
	+ mlx4_set_port_mask(dev);
	+
	  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] =
dev_cap->reserved_qps;
	  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
	   dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
	Index: hw/mlx4/kernel/bus/net/port.c
	
===================================================================
	--- hw/mlx4/kernel/bus/net/port.c (revision 2617)
	+++ hw/mlx4/kernel/bus/net/port.c (working copy)
	@@ -33,7 +33,9 @@
	 
	 #include "mlx4.h"
	 #include "cmd.h"
	+#include "public.h"
	 
	+extern NTSTATUS __create_child();
	 
	 void mlx4_init_mac_table(struct mlx4_dev *dev, u8 port)
	 {
	@@ -60,6 +62,10 @@
	   table->refs[i] = 0;
	  }
	  table->max = 1 << dev->caps.log_num_vlans;
	+ if(table->max > MLX4_MAX_VLAN_NUM)
	+ {
	+  table->max = MLX4_MAX_VLAN_NUM;
	+ }
	  table->total = 0;
	 }
	 
	@@ -84,6 +90,52 @@
	  return err;
	 }
	 
	+static void mlx4_addrconf_ifid_eui48_win(u8 *eui, u64 mac)
	+{
	+    u8 *p = (u8*)&mac+2; //mac 6 bytes
	+ memcpy(eui, p, 3);
	+ memcpy(eui + 5, p + 3, 3);
	+ eui[3] = 0xFF;
	+ eui[4] = 0xFE;
	+ eui[0] ^= 2;
	+}
	+
	+
	+static int update_ipv6_gids_win(struct mlx4_dev *dev, int port,
int clear, u64 mac)
	+{
	+ struct mlx4_cmd_mailbox *mailbox;
	+ union ib_gid *gids, *tmpgids;
	+ int err;
	+
	+ tmpgids = kzalloc(128 * sizeof *gids, GFP_ATOMIC);
	+ if (!tmpgids)
	+  return -ENOMEM;
	+
	+ if (!clear) {
	+  mlx4_addrconf_ifid_eui48_win(&tmpgids[0].raw[8],
cpu_to_be64(mac));
	+  tmpgids[0].global.subnet_prefix =
cpu_to_be64(0xfe80000000000000LL);
	+ }
	+
	+ mailbox = mlx4_alloc_cmd_mailbox(dev);
	+ if (IS_ERR(mailbox)) {
	+  err = PTR_ERR(mailbox);
	+  goto out;
	+ }
	+
	+ gids = mailbox->buf;
	+ memcpy(gids, tmpgids, 128 * sizeof *gids);
	+
	+ err = mlx4_cmd(dev, mailbox->dma.da, MLX4_SET_PORT_GID_TABLE
<< 8 | port,
	+         1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B);
	+
	+ mlx4_free_cmd_mailbox(dev, mailbox);
	+
	+out:
	+ kfree(tmpgids);
	+ return err;
	+}
	+
	+
	 int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac,
int *index)
	 {
	  struct mlx4_mac_table *table =
	@@ -112,7 +164,7 @@
	  }
	  mlx4_dbg(dev, "Free mac index is %d\n", free);
	 
	- if (table->total == table->max) {
	+ if (table->total == table->max || free < 0) {
	   /* No free mac entries */
	   err = -ENOSPC;
	   goto out;
	@@ -132,6 +184,20 @@
	 
	  *index = free;
	  ++table->total;
	+
	+ //update port guid with mac address
	+ update_ipv6_gids_win(dev, port, 0, mac);
	+   
	+ if(!InterlockedExchange(&dev->pdev->ib_hca_created, 1))
	+ {
	+     NTSTATUS status = STATUS_SUCCESS;
	+  status = __create_child(dev->pdev->p_wdf_device,
BUS_HARDWARE_IDS, BUS_HARDWARE_DESCRIPTION, 0 );
	+  if (!NT_SUCCESS(status)) {
	+    mlx4_err(dev, "__create_child (ib)failed with 0x%x\n",
status);
	+    dev->pdev->ib_hca_created = FALSE;
	+  }
	+ }
	+
	 out:
	  up(&table->mac_sem);
	  return err;
	@@ -207,7 +273,7 @@
	   }
	  }
	 
	- if (table->total == table->max) {
	+ if (table->total == table->max || free < 0) {
	   /* No free vlan entries */
	   err = -ENOSPC;
	   goto out;
	Index: hw/mlx4/kernel/bus/net/SOURCES
	
===================================================================
	--- hw/mlx4/kernel/bus/net/SOURCES (revision 2617)
	+++ hw/mlx4/kernel/bus/net/SOURCES (working copy)
	@@ -31,7 +31,7 @@
	  srq.c   \
	         port.c                  \
	 
	
-INCLUDES=..;..\inc;..\..\inc;..\core\$O;..\..\..\..\..\inc;..\..\..\..\
..\inc\kernel;
	
+INCLUDES=..;..\inc;..\..\inc;..\..\..\inc;..\core\$O;..\..\..\..\..\inc
;..\..\..\..\..\inc\kernel;
	 
	 C_DEFINES=$(C_DEFINES) -DDRIVER -DDEPRECATE_DDK_FUNCTIONS
-D__LITTLE_ENDIAN -DUSE_WDM_INTERRUPTS 
	 #-DFORCE_LIVEFISH
	Index: hw/mlx4/kernel/hca/av.c
	
===================================================================
	--- hw/mlx4/kernel/hca/av.c (revision 2617)
	+++ hw/mlx4/kernel/hca/av.c (working copy)
	@@ -74,6 +74,7 @@
	  p_ib_ah = p_ib_pd->device->create_ah(p_ib_pd, &ah_attr);
	  if (IS_ERR(p_ib_ah)) {
	   err = PTR_ERR(p_ib_ah);
	+  status = errno_to_iberr(err);
	   HCA_PRINT(TRACE_LEVEL_ERROR,HCA_DBG_AV ,("create_ah failed
(%d)\n", err));
	   goto err_create_ah;
	  }
	Index: hw/mlx4/kernel/hca/data.c
	
===================================================================
	--- hw/mlx4/kernel/hca/data.c (revision 2617)
	+++ hw/mlx4/kernel/hca/data.c (working copy)
	@@ -339,6 +339,7 @@
	    ibal_port_p->max_vls    = mthca_port_p->max_vl_num;
	    ibal_port_p->sm_lid     = cl_ntoh16(mthca_port_p->sm_lid);
	    ibal_port_p->sm_sl      = mthca_port_p->sm_sl;
	+   ibal_port_p->transport  = mthca_port_p->transport;
	    ibal_port_p->link_state = (mthca_port_p->state != 0) ?
(uint8_t)mthca_port_p->state : IB_LINK_DOWN;
	    ibal_port_p->num_gids   =
(uint16_t)mthca_port_p->gid_tbl_len;
	    ibal_port_p->num_pkeys  = mthca_port_p->pkey_tbl_len;
	Index: hw/mlx4/kernel/inc/l2w.h
	
===================================================================
	--- hw/mlx4/kernel/inc/l2w.h (revision 2617)
	+++ hw/mlx4/kernel/inc/l2w.h (working copy)
	@@ -185,6 +185,8 @@
	  DMA_ADAPTER  *    p_dma_adapter; /* HCA adapter object */
	  DEVICE_OBJECT *    p_self_do;  /* mlx4_bus's FDO */
	  DEVICE_OBJECT *    pdo;   /* mlx4_bus's PDO */
	+ PVOID                           p_wdf_device;   /* wdf_device
*/
	+ LONG       ib_hca_created;
	  // mlx4_ib: various objects and info 
	  struct ib_device *    ib_dev;
	  // mlx4_net: various objects and info 
	Index: inc/iba/ib_types.h
	
===================================================================
	--- inc/iba/ib_types.h (revision 2617)
	+++ inc/iba/ib_types.h (working copy)
	@@ -9419,6 +9419,8 @@
	  TO_LONG_PTR(ib_gid_t*, p_gid_table);
	  TO_LONG_PTR(ib_net16_t*,p_pkey_table);
	 
	+ enum rdma_transport_type transport;
	+
	 } ib_port_attr_t;
	 /*
	 * SEE ALSO
	Index: ulp/opensm/user/include/iba/ib_types.h
	
===================================================================
	--- ulp/opensm/user/include/iba/ib_types.h (revision 2617)
	+++ ulp/opensm/user/include/iba/ib_types.h (working copy)
	@@ -8676,6 +8676,7 @@
	  ib_gid_t    *p_gid_table;
	  ib_net16_t    *p_pkey_table;
	 
	+ enum rdma_transport_type transport;
	 } ib_port_attr_t;
	 /*
	 * SEE ALSO
	Index: ulp/opensm/user/include/iba/ib_types_extended.h
	
===================================================================
	--- ulp/opensm/user/include/iba/ib_types_extended.h (revision
2617)
	+++ ulp/opensm/user/include/iba/ib_types_extended.h (working
copy)
	@@ -586,6 +586,7 @@
	  TO_LONG_PTR(ib_gid_t*, p_gid_table);
	  TO_LONG_PTR(ib_net16_t*,p_pkey_table);
	 
	+ enum rdma_transport_type transport;
	 } ib_port_attr_t;
	 /*
	 * SEE ALSO
	

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20100121/489f1683/attachment.html>


More information about the ofw mailing list