[ofw] [IPoIB_CM] Problems with large pings

Alex Naslednikov xalex at mellanox.co.il
Mon Jan 12 11:09:18 PST 2009


Hello, Alex,
Thank you for the patch, it looks good.
I still have one remark regarding the UD payload:
   If one uses IPoIB in UD mode, it will no longer be possible to
configure the MTU to a value lower than DEFAULT_MTU.

I would like to propose two solutions:
1. Use two different registry parameters for UD and CM, or
2. If CM is disabled, the UD payload takes its value from the registry;
otherwise, the CM payload takes its value from the registry and UD takes
the maximum possible value (sketched below).

XaleX

-----Original Message-----
From: Alex Estrin [mailto:alex.estrin at qlogic.com] 
Sent: Monday, January 12, 2009 8:25 PM
To: Alex Naslednikov
Cc: ofw at lists.openfabrics.org
Subject: RE: [ofw] [IPoIB_CM] Problems with large pings

Hello,

> 4. And there's yet another inconsistency:
> Not all cards support 4K MTU (in UD mode). Originally, it was possible
> to enlarge the MTU to 4K by using IPoIB parameters.
> Now consider the following code:
>  
>  p_adapter->params.cm_payload_mtu =
>    min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
>  p_adapter->params.cm_xfer_block_size =
>    p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
>  p_adapter->params.payload_mtu =
>    min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu );
>  p_adapter->params.xfer_block_size =
>    (sizeof(eth_hdr_t) + p_adapter->params.payload_mtu);
>  
> It means that if you have a card that supports 4K MTU,
> p_adapter->params.payload_mtu will nevertheless get only the 2K value
> (and that's still important in UD mode).
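
Just to make the clamp concrete (values assumed, not taken from the headers):
with a 4K-capable HCA and a 4092-byte payload requested in the registry, the
min() above still yields the default:

	payload_mtu = min( DEFAULT_PAYLOAD_MTU, 4092 );   /* -> ~2K default, the 4K request is lost */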

Here is a possible solution for this issue.
Please review.

Thanks,
Alex.

Signed-off-by: Alex Estrin <alex.estrin at qlogic.com>
----
Index: kernel/ipoib_driver.c
===================================================================
--- kernel/ipoib_driver.c	(revision 1777)
+++ kernel/ipoib_driver.c	(working copy)
@@ -614,13 +614,7 @@
 				EVENT_IPOIB_CONNECTED_MODE_ERR, 1, 0xbadc0de0 );
 		}
 	}
-	p_adapter->params.cm_payload_mtu =
-			min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
-	p_adapter->params.cm_xfer_block_size = 
-			p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
-	p_adapter->params.payload_mtu = 
-			min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
-	p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) + p_adapter->params.payload_mtu);
+
 	NdisReadNetworkAddress( &status, p_mac, p_len, h_config );
 
 
@@ -735,13 +729,10 @@
 	IN				NDIS_HANDLE			h_adapter,
 	IN				NDIS_HANDLE			wrapper_config_context )
 {
-	NDIS_STATUS			status;
 	ib_api_status_t		ib_status;
 	UINT				medium_index;
 	ipoib_adapter_t		*p_adapter;
-#if IPOIB_USE_DMA
-	ULONG max_phys_mapping;
-#endif
+
 	IPOIB_ENTER( IPOIB_DBG_INIT );
 
 #ifdef _DEBUG_
@@ -783,23 +774,6 @@
 		NDIS_ATTRIBUTE_USES_SAFE_BUFFER_APIS,
 		NdisInterfacePNPBus );
 
-#if IPOIB_USE_DMA
-	max_phys_mapping = p_adapter->params.cm_enabled ? 
-		p_adapter->params.cm_xfer_block_size: p_adapter->params.xfer_block_size;
-	max_phys_mapping = p_adapter->params.lso ? 
-		max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping): max_phys_mapping;
-	status =
-		NdisMInitializeScatterGatherDma( h_adapter, TRUE, max_phys_mapping );
-
-	if( status != NDIS_STATUS_SUCCESS )
-	{
-		ipoib_destroy_adapter( p_adapter );
-		IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-			("NdisMInitializeScatterGatherDma returned 0x%.8x.\n", status) );
-		return status;
-	}
-#endif
-
 	/* Create the adapter adapter */
 	ib_status = ipoib_start_adapter( p_adapter );
 	if( ib_status != IB_SUCCESS )
@@ -815,7 +789,7 @@
 	ipoib_ref_ibat();
 
 	IPOIB_EXIT( IPOIB_DBG_INIT );
-	return status;
+	return NDIS_STATUS_SUCCESS;
 }
 
 
Index: kernel/ipoib_port.c
===================================================================
--- kernel/ipoib_port.c	(revision 1797)
+++ kernel/ipoib_port.c	(working copy)
@@ -896,6 +896,47 @@
 			("Query CA attributes failed\n" ) );
 		return status;
 	}
+
+	if( p_port->p_adapter->params.cm_enabled )
+	{
+		uint32_t	ud_payload_mtu = __port_attr_to_mtu_size( 
+			p_port->p_ca_attrs->p_port_attr[p_port->port_num - 1].mtu );
+		/* setup CM payload mtu */
+		p_port->p_adapter->params.cm_payload_mtu =
+			min( MAX_CM_PAYLOAD_MTU, p_port->p_adapter->params.payload_mtu );
+		p_port->p_adapter->params.cm_xfer_block_size = 
+			p_port->p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
+
+		/* adjust ipoib UD payload MTU to actual port MTU size. */
+		/* TODO: should initialization fail if port mtu < 2048 ? */
+		p_port->p_adapter->params.payload_mtu = 
+			( max( DEFAULT_MTU, ud_payload_mtu ) - sizeof(ipoib_hdr_t) );
+		p_port->p_adapter->params.xfer_block_size = 
+			(sizeof(eth_hdr_t) + p_port->p_adapter->params.payload_mtu);
+	}
+#if IPOIB_USE_DMA
+	/* init DMA only once while running MiniportInitialize */
+	if ( !p_port->p_adapter->reset )
+	{
+		ULONG max_phys_mapping;
+
+		max_phys_mapping = p_port->p_adapter->params.cm_enabled ?
+			p_port->p_adapter->params.cm_xfer_block_size :
+			p_port->p_adapter->params.xfer_block_size;
+		max_phys_mapping = p_port->p_adapter->params.lso ?
+			max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping): max_phys_mapping;
+		status =
+			NdisMInitializeScatterGatherDma( 
+			p_port->p_adapter->h_adapter, TRUE, max_phys_mapping );
+		if( status != NDIS_STATUS_SUCCESS )
+		{
+			IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+				("NdisMInitializeScatterGatherDma returned 0x%.8x.\n", status) );
+			return status;
+		}
+	}
+#endif
+
 	/* Allocate the PD. */
 	status = p_port->p_adapter->p_ifc->alloc_pd(
 		p_port->ib_mgr.h_ca, IB_PDT_UD, p_port, &p_port->ib_mgr.h_pd );
@@ -5268,7 +5309,7 @@
 
 	IPOIB_ENTER( IPOIB_DBG_ENDPT );
 
-	/* This function must be called from the recieve path */
+	/* This function must be called from the receive path */
 	CL_ASSERT(p_port->endpt_rdr > 0);
 
 	cl_obj_lock( &p_port->obj );
@@ -5423,7 +5464,7 @@
 	/* Set global routing information. */
 	ib_path_rec_set_hop_flow_raw( p_path, hop_limit, flow_lbl, FALSE
);
 	p_path->tclass = p_port->ib_mgr.bcast_rec.tclass;
- 
+
 	cl_obj_unlock( &p_port->obj );
 
 	IPOIB_EXIT( IPOIB_DBG_ENDPT );
Index: kernel/ipoib_port.h
===================================================================
--- kernel/ipoib_port.h	(revision 1782)
+++ kernel/ipoib_port.h	(working copy)
@@ -774,4 +774,22 @@
 	IN				ipoib_port_t* const		p_port,
 	IN				PVOID					cancel_id );
 
+static inline uint32_t
+__port_attr_to_mtu_size(uint32_t value) {
+	switch (value) 
+	{
+	default:
+	case IB_MTU_LEN_2048:
+		return 2048;
+	case IB_MTU_LEN_4096:
+		return 4096;
+	case IB_MTU_LEN_1024:
+		return 1024;
+	case IB_MTU_LEN_512:
+		return  512;
+	case IB_MTU_LEN_256:
+		return  256;
+	}
+}
+
 #endif	/* _IPOIB_PORT_H_ */

> Hello,
> 
> Thank you for your feedback.
> 
> 1. This structure was used for header dereference only,
> and CM receive buffers are allocated with size = cm_xfer_block_size.
> Please see __cm_recv_desc_ctor() in ipoib_endpoint.c.
> Could you please point me to the spot where memory corruption
> is possible?
> 
> 2-3. Yes, this is an issue I'm aware of.
> The major problem, I think, is that the send descriptor is
> used from the stack, but setting MAX_WRS_PER_MSG > 16 would
> make the send descriptor too big and cause a stack overflow.
> A possible solution would be to use a pool of send descriptors
> (similar to what we use on the receive side); a rough sketch
> follows this list.
> 
> 4. For 4K fabrics, setting the IPoIB parameter is sometimes not enough;
> legacy HCAs and switches can also coexist on the
> same fabric.
> However, you are right, the way payload_mtu is adjusted could
> be done better.
> I think the best way to handle this is to get the actual port MTU
> value from the CA query during port initialization,
> then adjust the payload_mtu parameter accordingly
> (also, in this case, NdisMInitializeScatterGatherDma should be
> called after the parameter is fixed).
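
A minimal sketch of the send-descriptor pool idea from point 2-3 above,
assuming a plain NDIS lookaside list; ipoib_send_desc_t, the pool variable
and the pool tag are illustrative, not the driver's actual layout:

	/* illustrative only: move the send descriptor off the stack into a pool */
	NPAGED_LOOKASIDE_LIST	send_desc_pool;

	/* once, at adapter/port initialization */
	NdisInitializeNPagedLookasideList( &send_desc_pool, NULL, NULL, 0,
		sizeof(ipoib_send_desc_t), 'DSPI', 0 );

	/* in the send path, instead of a stack-allocated descriptor */
	ipoib_send_desc_t	*p_desc =
		NdisAllocateFromNPagedLookasideList( &send_desc_pool );
	if( !p_desc )
		return NDIS_STATUS_RESOURCES;
	/* ... build up to MAX_WRS_PER_MSG work requests into p_desc and post ... */

	/* on send completion */
	NdisFreeToNPagedLookasideList( &send_desc_pool, p_desc );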
> 
> Thoughts?
> 
> Thanks,
> Alex. 
> 
> > -----Original Message-----
> > From: Alex Naslednikov [mailto:xalex at mellanox.co.il] 
> > Sent: Thursday, January 08, 2009 11:02 AM
> > To: Alex Estrin
> > Cc: Tzachi Dar; Ishai Rabinovitz; ofw at lists.openfabrics.org
> > Subject: [ofw] [IPoIB_CM] Problems with large pings
> > 
> > Hello, Alex,
> > I tried to test the IPoIB_CM module (in connected mode, rev.
> > 1797) and found a problem when sending large pings.
> > Settings: MTU_SIZE == 32K (32768), Connected Mode: On, ping
> > size: 32K.
> > Below is a description of the problem and some investigation.
> > Please let us know your suggestions and comments regarding
> > this issue; meanwhile, we continue to debug.
> >  
> >  
> > This ping always fails and almost always crashes both machines.
> > 1. I enlarged the size of _ipoib_pkt!data to be
> > MAX_CM_PAYLOAD_MTU, just to avoid possible memory corruption
> > on the receive side during debugging.
> > I found that this struct is also used during CM operations and
> > can theoretically receive buffers larger than 4K (the max MTU
> > size for UD):
> >  
> > typedef struct _ipoib_pkt
> > {
> >  ipoib_hdr_t  hdr;
> >  union _payload
> >  {
> > - uint8_t   data[MAX_UD_PAYLOAD_MTU];
> > + uint8_t   data[MAX_CM_PAYLOAD_MTU];
> >   ipoib_arp_pkt_t arp;
> >   ip_pkt_t  ip;
> >  
> >  } PACK_SUFFIX type;
> >  
> > } PACK_SUFFIX ipoib_pkt_t;
> >
> > 2. Now the crash was not reproduced, but the ping still didn't work.
> > That's because MAX_WRS_PER_MSG was defined as
> > MAX_CM_PAYLOAD_MTU/MAX_UD_PAYLOAD_MTU.
> > In our case, MAX_WRS_PER_MSG was equal to 16 (62520/4092).
> > For an ICMP packet, IPoIB_CM still uses UD mode;
> > i.e. it tries to fragment the message into chunks of 'payload_mtu'.
> > In our case, params.payload_mtu == 2K, so a ping of 32K
> > should be fragmented into 16 chunks.
> >  
> > From __send_fragments:
> > seg_len = ( next_sge > ( p_port->p_adapter->params.payload_mtu - wr_size ) ) ?
> >     ( p_port->p_adapter->params.payload_mtu - wr_size ) : next_sge;
> >  
> > 3. But there is the following check inside __send_fragments:
> >  
> >   if( wr_idx >= ( MAX_WRS_PER_MSG - 1 ) )
> >    return NDIS_STATUS_RESOURCES;
> >  
> > In our case, wr_idx always reaches 15 (i.e. 16 elements), so
> > this check triggers and the send fails with NDIS_STATUS_RESOURCES
> > (a worked sketch follows this list).
> >  
> > 4. And there's yet another inconsistency:
> > Not all cards support 4K MTU (in UD mode). Originally, it was
> > possible to enlarge the MTU to 4K by using IPoIB parameters.
> > Now consider the following code:
> >  
> >  p_adapter->params.cm_payload_mtu =
> >    min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> >  p_adapter->params.cm_xfer_block_size =
> >    p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> >  p_adapter->params.payload_mtu =
> >    min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> >  p_adapter->params.xfer_block_size =
> >    (sizeof(eth_hdr_t) + p_adapter->params.payload_mtu);
> >  
> > It means that if you have a card that supports 4K MTU,
> > p_adapter->params.payload_mtu will nevertheless get only the 2K
> > value (and that's still important in UD mode).
> >  
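
A small worked sketch of the limit from points 2-3 above, with assumed values
(the ~2K UD payload and the 32K ping from the report), not the driver's exact
figures:

	uint32_t payload_mtu = 2048;    /* UD payload after the 2K clamp (approx.) */
	uint32_t ping_size   = 32768;   /* 32K ICMP payload */
	uint32_t wrs_needed  = ( ping_size + payload_mtu - 1 ) / payload_mtu;   /* == 16 */

	/* __send_fragments bails out once wr_idx reaches MAX_WRS_PER_MSG - 1 (== 15),
	 * so the 16th fragment returns NDIS_STATUS_RESOURCES and the ping fails. */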
> > Thanks,
> > XaleX
> >  
> 


