[ofw] [IPoIB_CM] Problems with large pings
Alex Naslednikov
xalex at mellanox.co.il
Mon Jan 12 11:09:18 PST 2009
Hello, Alex,
Thank you for the patch, it looks good.
I still have one remark regards (ud) payload :
If one uses IPoIB in UD mode, he will not have the possibility now to
configure MTU to the value lower than DEFAULT_MTU.
I would like to propose 2 solutions:
1. using 2 different parameters for UD and CM, or
2. If CM is disabled, ud payload will take the value from the registry;
otherwise, cm payload will take the value from the registry and UD will
take the maximum possible value
XaleX
-----Original Message-----
From: Alex Estrin [mailto:alex.estrin at qlogic.com]
Sent: Monday, January 12, 2009 8:25 PM
To: Alex Naslednikov
Cc: ofw at lists.openfabrics.org
Subject: RE: [ofw] [IPoIB_CM] Problems with large pings
Hello,
> 4. And there's yet another inconsistence:
> Not all cards support 4K MTU (in UD mode). Originally, it was possible
> to enlarge MTU to 4K by using IPoIB parameters.
> Now consider the following code:
>
> p_adapter->params.cm_payload_mtu =
> min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> p_adapter->params.cm_xfer_block_size =
> p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> p_adapter->params.payload_mtu =
> min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
> p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> p_adapter->params.payload_mtu);
>
> It means that if you has card that supports 4K MTU,
> p_adapter->params.payload_mtu will nevertheless get only 2K value (and
> that's still important in UD mode)
Here is a possible solution for this issue.
Please review.
Thanks,
Alex.
Signed-off-by: Alex Estrin <alex.estrin at qlogic.com>
----
Index: kernel/ipoib_driver.c
===================================================================
--- kernel/ipoib_driver.c (revision 1777)
+++ kernel/ipoib_driver.c (working copy)
@@ -614,13 +614,7 @@
EVENT_IPOIB_CONNECTED_MODE_ERR, 1,
0xbadc0de0 );
}
}
- p_adapter->params.cm_payload_mtu =
- min( MAX_CM_PAYLOAD_MTU,
p_adapter->params.payload_mtu );
- p_adapter->params.cm_xfer_block_size =
- p_adapter->params.cm_payload_mtu +
sizeof(eth_hdr_t);
- p_adapter->params.payload_mtu =
- min( DEFAULT_PAYLOAD_MTU,
p_adapter->params.payload_mtu);
- p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
p_adapter->params.payload_mtu);
+
NdisReadNetworkAddress( &status, p_mac, p_len, h_config );
@@ -735,13 +729,10 @@
IN NDIS_HANDLE
h_adapter,
IN NDIS_HANDLE
wrapper_config_context )
{
- NDIS_STATUS status;
ib_api_status_t ib_status;
UINT medium_index;
ipoib_adapter_t *p_adapter;
-#if IPOIB_USE_DMA
- ULONG max_phys_mapping;
-#endif
+
IPOIB_ENTER( IPOIB_DBG_INIT );
#ifdef _DEBUG_
@@ -783,23 +774,6 @@
NDIS_ATTRIBUTE_USES_SAFE_BUFFER_APIS,
NdisInterfacePNPBus );
-#if IPOIB_USE_DMA
- max_phys_mapping = p_adapter->params.cm_enabled ?
- p_adapter->params.cm_xfer_block_size:
p_adapter->params.xfer_block_size;
- max_phys_mapping = p_adapter->params.lso ?
- max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping):
max_phys_mapping;
- status =
- NdisMInitializeScatterGatherDma( h_adapter, TRUE,
max_phys_mapping );
-
- if( status != NDIS_STATUS_SUCCESS )
- {
- ipoib_destroy_adapter( p_adapter );
- IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
- ("NdisMInitializeScatterGatherDma returned
0x%.8x.\n", status) );
- return status;
- }
-#endif
-
/* Create the adapter adapter */
ib_status = ipoib_start_adapter( p_adapter );
if( ib_status != IB_SUCCESS )
@@ -815,7 +789,7 @@
ipoib_ref_ibat();
IPOIB_EXIT( IPOIB_DBG_INIT );
- return status;
+ return NDIS_STATUS_SUCCESS;
}
Index: kernel/ipoib_port.c
===================================================================
--- kernel/ipoib_port.c (revision 1797)
+++ kernel/ipoib_port.c (working copy)
@@ -896,6 +896,47 @@
("Query CA attributes failed\n" ) );
return status;
}
+
+ if( p_port->p_adapter->params.cm_enabled )
+ {
+ uin32_t ud_payload_mtu = __port_attr_to_mtu_size(
+ p_port->p_ca_attrs->p_port_attr[p_port->port_num
- 1].mtu );
+ /* setup CM payload mtu */
+ p_port->p_adapter->params.cm_payload_mtu =
+ min( MAX_CM_PAYLOAD_MTU,
p_port->p_adapter->params.payload_mtu );
+ p_port->p_adapter->params.cm_xfer_block_size =
+ p_port->p_adapter->params.cm_payload_mtu +
sizeof(eth_hdr_t);
+
+ /* adjust ipoib UD payload MTU to actual port MTU size.
*/
+ /* TODO: should initialization fail if port mtu < 2048 ?
*/
+ p_port->p_adapter->params.payload_mtu =
+ ( max( DEFAULT_MTU, ud_payload_mtu ) -
sizeof(ipoib_hdr_t) );
+ p_port->p_adapter->params.xfer_block_size =
+ (sizeof(eth_hdr_t) +
p_port->p_adapter->params.payload_mtu);
+ }
+#if IPOIB_USE_DMA
+ /* init DMA only once while running MiniportInitialize */
+ if ( !p_port->p_adapter->reset )
+ {
+ ULONG max_phys_mapping;
+
+ max_phys_mapping = p_port->p_adapter->params.cm_enabled
?
+ p_port->p_adapter->params.cm_xfer_block_size :
+ p_port->p_adapter->params.xfer_block_size;
+ max_phys_mapping = p_port->p_adapter->params.lso ?
+ max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping):
max_phys_mapping;
+ status =
+ NdisMInitializeScatterGatherDma(
+ p_port->p_adapter->h_adapter, TRUE,
max_phys_mapping );
+ if( status != NDIS_STATUS_SUCCESS )
+ {
+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
IPOIB_DBG_ERROR,
+ ("NdisMInitializeScatterGatherDma
returned 0x%.8x.\n", status) );
+ return status;
+ }
+ }
+#endif
+
/* Allocate the PD. */
status = p_port->p_adapter->p_ifc->alloc_pd(
p_port->ib_mgr.h_ca, IB_PDT_UD, p_port,
&p_port->ib_mgr.h_pd ); @@ -5268,7 +5309,7 @@
IPOIB_ENTER( IPOIB_DBG_ENDPT );
- /* This function must be called from the recieve path */
+ /* This function must be called from the receive path */
CL_ASSERT(p_port->endpt_rdr > 0);
cl_obj_lock( &p_port->obj );
@@ -5423,7 +5464,7 @@
/* Set global routing information. */
ib_path_rec_set_hop_flow_raw( p_path, hop_limit, flow_lbl, FALSE
);
p_path->tclass = p_port->ib_mgr.bcast_rec.tclass;
-
+
cl_obj_unlock( &p_port->obj );
IPOIB_EXIT( IPOIB_DBG_ENDPT );
Index: kernel/ipoib_port.h
===================================================================
--- kernel/ipoib_port.h (revision 1782)
+++ kernel/ipoib_port.h (working copy)
@@ -774,4 +774,22 @@
IN ipoib_port_t* const
p_port,
IN PVOID
cancel_id );
+static inline uint32_t
+__port_attr_to_mtu_size(uint32_t value) {
+ switch (value)
+ {
+ default:
+ case IB_MTU_LEN_2048:
+ return 2048;
+ case IB_MTU_LEN_4096:
+ return 4096;
+ case IB_MTU_LEN_1024:
+ return 1024;
+ case IB_MTU_LEN_512:
+ return 512;
+ case IB_MTU_LEN_256:
+ return 256;
+ }
+}
#endif /* _IPOIB_PORT_H_ */
> Hello,
>
> Thank you for your feedback.
>
> 1. This sturcture was used for header dereference only.
> And CM receive buffers are allocated of size = cm_xfer_block_size.
> Please see __cm_recv_desc_ctor() in ipoib_endpoint.c
> Could you please point me to the spot where memory corruption
> is possible?
>
> 2-3. Yes, this is an issue I'm aware of.
> The major the problem is,I think, that send descriptor is
> used from the stack, but setting MAX_WRS_PER_MSG > 16 would
> make send descriptor too big and cause stack overflow. The
> possible solution would be to use pool of send descriptors
> (similar to what we use on receive side).
>
> 4. For 4k fabrics IPoIB parameter setting sometimes is not enough.
> There are also legacy HCAs and switches can coexist on the
> same fabric.
> However you are right, the way payload_mtu is adjusted could
> be done better.
> I think the best way to handle this is to get actual port mtu
> value from ca query during port initialization,
> Then adjust payload_mtu parameter accordingly.
> (also in this case NdisMInitializeScatterGatherDma should be
> called after param is fixed).
>
> Thoughts?
>
> Thanks,
> Alex.
>
> > -----Original Message-----
> > From: Alex Naslednikov [mailto:xalex at mellanox.co.il]
> > Sent: Thursday, January 08, 2009 11:02 AM
> > To: Alex Estrin
> > Cc: Tzachi Dar; Ishai Rabinovitz; ofw at lists.openfabrics.org
> > Subject: [ofw] [IPoIB_CM] Problems with large pings
> >
> > Hello, Alex,
> > I tried to test IPoIB_CM module (in connected mode, rev.
> > 1797) and found a problem when sending large ping.
> > Settings: MTU_SIZE == 32K (32768), Connected Mode: On, ping
> size: 32K
> > Below is the description of the problem and some investigations.
> > Please, let us know your suggestions and comments regards
> > this issue; meanwhile, we continue to debug.
> >
> >
> > This ping always fails and almost always leads both
> machines to crash.
> > 1. I enlarged the size of _ipoib_pkt!data to be
> > MAX_CM_PAYLOAD_MTU just to avoid possible memory corruption
> > on receive side during the debug.
> > I found that this struct also used during CM operations and
> > theoretically can receive buffers larger than 4K (Max MTU
> size for UD)
> >
> > typedef struct _ipoib_pkt
> > {
> > ipoib_hdr_t hdr;
> > union _payload
> > {
> > - uint8_t data[MAX_UD_PAYLOAD_MTU];
> > + uint8_t data[MAX_CM_PAYLOAD_MTU];
> > ipoib_arp_pkt_t arp;
> > ip_pkt_t ip;
> >
> > } PACK_SUFFIX type;
> >
> > } PACK_SUFFIX ipoib_pkt_t;
> >
> > 2. Now the crash was not reproduced, but ping still didn't work.
> > That's because MAX_WRS_PER_MSG was defined as
> > MAX_CM_PAYLOAD_MTU/MAX_UD_PAYLOAD_MTU.
> > In our case, MAX_WRS_PER_MSG was equal to 16 (62520/4092).
> > In the case of ICMP packet IPoIB_CM still uses the UD mode;
> > i.e. it tries to fragment the message into chunks of 'payload_mtu'.
> > In our case, params.payload_mtu == 2K. That is, Ping of 32K
> > should be fragmented into 16 chunks.
> >
> > From the __send_fragments:
> > seg_len = ( next_sge > (
> > p_port->p_adapter->params.payload_mtu - wr_size ) )?
> > ( p_port->p_adapter->params.payload_mtu - wr_size ) : next_sge;
> >
> > 3. But there's a following check inside __send_fragments:
> >
> > if( wr_idx >= ( MAX_WRS_PER_MSG - 1 ) )
> > return NDIS_STATUS_RESOURCES;
> >
> > In our case, wr_idx always reach 15 (i.e. 16 elements), so
> > this check fails.
> >
> > 4. And there's yet another inconsistence:
> > Not all cards support 4K MTU (in UD mode). Originally, it was
> > possible to enlarge MTU to 4K by using IPoIB parameters.
> > Now consider the following code:
> >
> > p_adapter->params.cm_payload_mtu =
> > min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> > p_adapter->params.cm_xfer_block_size =
> > p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> > p_adapter->params.payload_mtu =
> > min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
> > p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> > p_adapter->params.payload_mtu);
> >
> > It means that if you has card that supports 4K MTU,
> > p_adapter->params.payload_mtu will nevertheless get only 2K
> > value (and that's still important in UD mode)
> >
> > Thanks,
> > XaleX
> >
> > _______________________________________________
> ofw mailing list
> ofw at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
>
More information about the ofw
mailing list