[ofw] [IPoIB_CM] Problems with large pings
Alex Estrin
alex.estrin at qlogic.com
Mon Jan 12 12:10:36 PST 2009
Hello,
> I would like to propose 2 solutions:
> 1. using 2 different parameters for UD and CM, or
Yes, we can consider this variant,
although I was trying to keep minimum user-specified parameters. For now at least.
> 2. If CM is disabled, ud payload will take the value from the
> registry;
> otherwise, cm payload will take the value from the registry
> and UD will
> take the maximum possible value
UD payload parameter won't be changed unless CM is enabled.
Please note all adjustments are guarded by 'cm_enabled' flag check
+ if( p_port->p_adapter->params.cm_enabled )
+ {
....
}
If flag is not set ud payload will be used as specified in registry and interface will keep running in UD mode.
At least that were my intentions. :)
However just noticed we still have to limit upper size of ud mtu if cm is disabled.
So here is a new version of the patch.
Thanks,
Alex.
Index: kernel/ipoib_driver.c
===================================================================
--- kernel/ipoib_driver.c (revision 1777)
+++ kernel/ipoib_driver.c (working copy)
@@ -614,13 +614,13 @@
EVENT_IPOIB_CONNECTED_MODE_ERR, 1, 0xbadc0de0 );
}
}
- p_adapter->params.cm_payload_mtu =
- min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
- p_adapter->params.cm_xfer_block_size =
- p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
- p_adapter->params.payload_mtu =
- min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
- p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) + p_adapter->params.payload_mtu);
+ /* adjust ud mtu upper limit if cm is disabled */
+ if( !p_adapter->params.cm_enabled )
+ {
+ p_adapter->params.payload_mtu =
+ min( MAX_UD_PAYLOAD_MTU, p_adapter->params.payload_mtu );
+ }
+
NdisReadNetworkAddress( &status, p_mac, p_len, h_config );
@@ -735,13 +735,10 @@
IN NDIS_HANDLE h_adapter,
IN NDIS_HANDLE wrapper_config_context )
{
- NDIS_STATUS status;
ib_api_status_t ib_status;
UINT medium_index;
ipoib_adapter_t *p_adapter;
-#if IPOIB_USE_DMA
- ULONG max_phys_mapping;
-#endif
+
IPOIB_ENTER( IPOIB_DBG_INIT );
#ifdef _DEBUG_
@@ -783,23 +780,6 @@
NDIS_ATTRIBUTE_USES_SAFE_BUFFER_APIS,
NdisInterfacePNPBus );
-#if IPOIB_USE_DMA
- max_phys_mapping = p_adapter->params.cm_enabled ?
- p_adapter->params.cm_xfer_block_size: p_adapter->params.xfer_block_size;
- max_phys_mapping = p_adapter->params.lso ?
- max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping): max_phys_mapping;
- status =
- NdisMInitializeScatterGatherDma( h_adapter, TRUE, max_phys_mapping );
-
- if( status != NDIS_STATUS_SUCCESS )
- {
- ipoib_destroy_adapter( p_adapter );
- IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
- ("NdisMInitializeScatterGatherDma returned 0x%.8x.\n", status) );
- return status;
- }
-#endif
-
/* Create the adapter adapter */
ib_status = ipoib_start_adapter( p_adapter );
if( ib_status != IB_SUCCESS )
@@ -815,7 +795,7 @@
ipoib_ref_ibat();
IPOIB_EXIT( IPOIB_DBG_INIT );
- return status;
+ return NDIS_STATUS_SUCCESS;
}
Index: kernel/ipoib_port.c
===================================================================
--- kernel/ipoib_port.c (revision 1797)
+++ kernel/ipoib_port.c (working copy)
@@ -896,6 +896,47 @@
("Query CA attributes failed\n" ) );
return status;
}
+
+ if( p_port->p_adapter->params.cm_enabled )
+ {
+ uin32_t ud_payload_mtu = __port_attr_to_mtu_size(
+ p_port->p_ca_attrs->p_port_attr[p_port->port_num - 1].mtu );
+ /* setup CM payload mtu */
+ p_port->p_adapter->params.cm_payload_mtu =
+ min( MAX_CM_PAYLOAD_MTU, p_port->p_adapter->params.payload_mtu );
+ p_port->p_adapter->params.cm_xfer_block_size =
+ p_port->p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
+
+ /* adjust ipoib UD payload MTU to actual port MTU size. */
+ /* TODO: should initialization fail if port mtu < 2048 ? */
+ p_port->p_adapter->params.payload_mtu =
+ ( max( DEFAULT_MTU, ud_payload_mtu ) - sizeof(ipoib_hdr_t) );
+ p_port->p_adapter->params.xfer_block_size =
+ (sizeof(eth_hdr_t) + p_port->p_adapter->params.payload_mtu);
+ }
+#if IPOIB_USE_DMA
+ /* init DMA only once while running MiniportInitialize */
+ if ( !p_port->p_adapter->reset )
+ {
+ ULONG max_phys_mapping;
+
+ max_phys_mapping = p_port->p_adapter->params.cm_enabled ?
+ p_port->p_adapter->params.cm_xfer_block_size :
+ p_port->p_adapter->params.xfer_block_size;
+ max_phys_mapping = p_port->p_adapter->params.lso ?
+ max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping): max_phys_mapping;
+ status =
+ NdisMInitializeScatterGatherDma(
+ p_port->p_adapter->h_adapter, TRUE, max_phys_mapping );
+ if( status != NDIS_STATUS_SUCCESS )
+ {
+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+ ("NdisMInitializeScatterGatherDma returned 0x%.8x.\n", status) );
+ return status;
+ }
+ }
+#endif
+
/* Allocate the PD. */
status = p_port->p_adapter->p_ifc->alloc_pd(
p_port->ib_mgr.h_ca, IB_PDT_UD, p_port, &p_port->ib_mgr.h_pd );
@@ -5268,7 +5309,7 @@
IPOIB_ENTER( IPOIB_DBG_ENDPT );
- /* This function must be called from the recieve path */
+ /* This function must be called from the receive path */
CL_ASSERT(p_port->endpt_rdr > 0);
cl_obj_lock( &p_port->obj );
@@ -5423,7 +5464,7 @@
/* Set global routing information. */
ib_path_rec_set_hop_flow_raw( p_path, hop_limit, flow_lbl, FALSE );
p_path->tclass = p_port->ib_mgr.bcast_rec.tclass;
-
+
cl_obj_unlock( &p_port->obj );
IPOIB_EXIT( IPOIB_DBG_ENDPT );
Index: kernel/ipoib_port.h
===================================================================
--- kernel/ipoib_port.h (revision 1782)
+++ kernel/ipoib_port.h (working copy)
@@ -774,4 +774,22 @@
IN ipoib_port_t* const p_port,
IN PVOID cancel_id );
+static inline uint32_t
+__port_attr_to_mtu_size(uint32_t value)
+{
+ switch (value)
+ {
+ default:
+ case IB_MTU_LEN_2048:
+ return 2048;
+ case IB_MTU_LEN_4096:
+ return 4096;
+ case IB_MTU_LEN_1024:
+ return 1024;
+ case IB_MTU_LEN_512:
+ return 512;
+ case IB_MTU_LEN_256:
+ return 256;
+ }
+}
#endif /* _IPOIB_PORT_H_ */
>
> Hello, Alex,
> Thank you for the patch, it looks good.
> I still have one remark regards (ud) payload :
> If one uses IPoIB in UD mode, he will not have the
> possibility now to
> configure MTU to the value lower than DEFAULT_MTU.
>
> I would like to propose 2 solutions:
> 1. using 2 different parameters for UD and CM, or
> 2. If CM is disabled, ud payload will take the value from the
> registry;
> otherwise, cm payload will take the value from the registry
> and UD will
> take the maximum possible value
>
> XaleX
>
> -----Original Message-----
> From: Alex Estrin [mailto:alex.estrin at qlogic.com]
> Sent: Monday, January 12, 2009 8:25 PM
> To: Alex Naslednikov
> Cc: ofw at lists.openfabrics.org
> Subject: RE: [ofw] [IPoIB_CM] Problems with large pings
>
> Hello,
>
> > 4. And there's yet another inconsistence:
> > Not all cards support 4K MTU (in UD mode). Originally, it
> was possible
>
> > to enlarge MTU to 4K by using IPoIB parameters.
> > Now consider the following code:
> >
> > p_adapter->params.cm_payload_mtu =
> > min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> > p_adapter->params.cm_xfer_block_size =
> > p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> > p_adapter->params.payload_mtu =
> > min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
> > p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> > p_adapter->params.payload_mtu);
> >
> > It means that if you has card that supports 4K MTU,
> > p_adapter->params.payload_mtu will nevertheless get only 2K
> value (and
>
> > that's still important in UD mode)
>
> Here is a possible solution for this issue.
> Please review.
>
> Thanks,
> Alex.
>
> Signed-off-by: Alex Estrin <alex.estrin at qlogic.com>
> ----
> Index: kernel/ipoib_driver.c
> ===================================================================
> --- kernel/ipoib_driver.c (revision 1777)
> +++ kernel/ipoib_driver.c (working copy)
> @@ -614,13 +614,7 @@
> EVENT_IPOIB_CONNECTED_MODE_ERR, 1,
> 0xbadc0de0 );
> }
> }
> - p_adapter->params.cm_payload_mtu =
> - min( MAX_CM_PAYLOAD_MTU,
> p_adapter->params.payload_mtu );
> - p_adapter->params.cm_xfer_block_size =
> - p_adapter->params.cm_payload_mtu +
> sizeof(eth_hdr_t);
> - p_adapter->params.payload_mtu =
> - min( DEFAULT_PAYLOAD_MTU,
> p_adapter->params.payload_mtu);
> - p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> p_adapter->params.payload_mtu);
> +
> NdisReadNetworkAddress( &status, p_mac, p_len, h_config );
>
>
> @@ -735,13 +729,10 @@
> IN NDIS_HANDLE
> h_adapter,
> IN NDIS_HANDLE
> wrapper_config_context )
> {
> - NDIS_STATUS status;
> ib_api_status_t ib_status;
> UINT medium_index;
> ipoib_adapter_t *p_adapter;
> -#if IPOIB_USE_DMA
> - ULONG max_phys_mapping;
> -#endif
> +
> IPOIB_ENTER( IPOIB_DBG_INIT );
>
> #ifdef _DEBUG_
> @@ -783,23 +774,6 @@
> NDIS_ATTRIBUTE_USES_SAFE_BUFFER_APIS,
> NdisInterfacePNPBus );
>
> -#if IPOIB_USE_DMA
> - max_phys_mapping = p_adapter->params.cm_enabled ?
> - p_adapter->params.cm_xfer_block_size:
> p_adapter->params.xfer_block_size;
> - max_phys_mapping = p_adapter->params.lso ?
> - max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping):
> max_phys_mapping;
> - status =
> - NdisMInitializeScatterGatherDma( h_adapter, TRUE,
> max_phys_mapping );
> -
> - if( status != NDIS_STATUS_SUCCESS )
> - {
> - ipoib_destroy_adapter( p_adapter );
> - IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> - ("NdisMInitializeScatterGatherDma returned
> 0x%.8x.\n", status) );
> - return status;
> - }
> -#endif
> -
> /* Create the adapter adapter */
> ib_status = ipoib_start_adapter( p_adapter );
> if( ib_status != IB_SUCCESS )
> @@ -815,7 +789,7 @@
> ipoib_ref_ibat();
>
> IPOIB_EXIT( IPOIB_DBG_INIT );
> - return status;
> + return NDIS_STATUS_SUCCESS;
> }
>
>
> Index: kernel/ipoib_port.c
> ===================================================================
> --- kernel/ipoib_port.c (revision 1797)
> +++ kernel/ipoib_port.c (working copy)
> @@ -896,6 +896,47 @@
> ("Query CA attributes failed\n" ) );
> return status;
> }
> +
> + if( p_port->p_adapter->params.cm_enabled )
> + {
> + uin32_t ud_payload_mtu = __port_attr_to_mtu_size(
> + p_port->p_ca_attrs->p_port_attr[p_port->port_num
> - 1].mtu );
> + /* setup CM payload mtu */
> + p_port->p_adapter->params.cm_payload_mtu =
> + min( MAX_CM_PAYLOAD_MTU,
> p_port->p_adapter->params.payload_mtu );
> + p_port->p_adapter->params.cm_xfer_block_size =
> + p_port->p_adapter->params.cm_payload_mtu +
> sizeof(eth_hdr_t);
> +
> + /* adjust ipoib UD payload MTU to actual port MTU size.
> */
> + /* TODO: should initialization fail if port mtu < 2048 ?
> */
> + p_port->p_adapter->params.payload_mtu =
> + ( max( DEFAULT_MTU, ud_payload_mtu ) -
> sizeof(ipoib_hdr_t) );
> + p_port->p_adapter->params.xfer_block_size =
> + (sizeof(eth_hdr_t) +
> p_port->p_adapter->params.payload_mtu);
> + }
> +#if IPOIB_USE_DMA
> + /* init DMA only once while running MiniportInitialize */
> + if ( !p_port->p_adapter->reset )
> + {
> + ULONG max_phys_mapping;
> +
> + max_phys_mapping = p_port->p_adapter->params.cm_enabled
> ?
> + p_port->p_adapter->params.cm_xfer_block_size :
> + p_port->p_adapter->params.xfer_block_size;
> + max_phys_mapping = p_port->p_adapter->params.lso ?
> + max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping):
> max_phys_mapping;
> + status =
> + NdisMInitializeScatterGatherDma(
> + p_port->p_adapter->h_adapter, TRUE,
> max_phys_mapping );
> + if( status != NDIS_STATUS_SUCCESS )
> + {
> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
> IPOIB_DBG_ERROR,
> + ("NdisMInitializeScatterGatherDma
> returned 0x%.8x.\n", status) );
> + return status;
> + }
> + }
> +#endif
> +
> /* Allocate the PD. */
> status = p_port->p_adapter->p_ifc->alloc_pd(
> p_port->ib_mgr.h_ca, IB_PDT_UD, p_port,
> &p_port->ib_mgr.h_pd ); @@ -5268,7 +5309,7 @@
>
> IPOIB_ENTER( IPOIB_DBG_ENDPT );
>
> - /* This function must be called from the recieve path */
> + /* This function must be called from the receive path */
> CL_ASSERT(p_port->endpt_rdr > 0);
>
> cl_obj_lock( &p_port->obj );
> @@ -5423,7 +5464,7 @@
> /* Set global routing information. */
> ib_path_rec_set_hop_flow_raw( p_path, hop_limit, flow_lbl, FALSE
> );
> p_path->tclass = p_port->ib_mgr.bcast_rec.tclass;
> -
> +
> cl_obj_unlock( &p_port->obj );
>
> IPOIB_EXIT( IPOIB_DBG_ENDPT );
> Index: kernel/ipoib_port.h
> ===================================================================
> --- kernel/ipoib_port.h (revision 1782)
> +++ kernel/ipoib_port.h (working copy)
> @@ -774,4 +774,22 @@
> IN ipoib_port_t* const
> p_port,
> IN PVOID
> cancel_id );
>
> +static inline uint32_t
> +__port_attr_to_mtu_size(uint32_t value) {
> + switch (value)
> + {
> + default:
> + case IB_MTU_LEN_2048:
> + return 2048;
> + case IB_MTU_LEN_4096:
> + return 4096;
> + case IB_MTU_LEN_1024:
> + return 1024;
> + case IB_MTU_LEN_512:
> + return 512;
> + case IB_MTU_LEN_256:
> + return 256;
> + }
> +}
> #endif /* _IPOIB_PORT_H_ */
>
> > Hello,
> >
> > Thank you for your feedback.
> >
> > 1. This sturcture was used for header dereference only.
> > And CM receive buffers are allocated of size = cm_xfer_block_size.
> > Please see __cm_recv_desc_ctor() in ipoib_endpoint.c
> > Could you please point me to the spot where memory corruption
> > is possible?
> >
> > 2-3. Yes, this is an issue I'm aware of.
> > The major the problem is,I think, that send descriptor is
> > used from the stack, but setting MAX_WRS_PER_MSG > 16 would
> > make send descriptor too big and cause stack overflow. The
> > possible solution would be to use pool of send descriptors
> > (similar to what we use on receive side).
> >
> > 4. For 4k fabrics IPoIB parameter setting sometimes is not enough.
> > There are also legacy HCAs and switches can coexist on the
> > same fabric.
> > However you are right, the way payload_mtu is adjusted could
> > be done better.
> > I think the best way to handle this is to get actual port mtu
> > value from ca query during port initialization,
> > Then adjust payload_mtu parameter accordingly.
> > (also in this case NdisMInitializeScatterGatherDma should be
> > called after param is fixed).
> >
> > Thoughts?
> >
> > Thanks,
> > Alex.
> >
> > > -----Original Message-----
> > > From: Alex Naslednikov [mailto:xalex at mellanox.co.il]
> > > Sent: Thursday, January 08, 2009 11:02 AM
> > > To: Alex Estrin
> > > Cc: Tzachi Dar; Ishai Rabinovitz; ofw at lists.openfabrics.org
> > > Subject: [ofw] [IPoIB_CM] Problems with large pings
> > >
> > > Hello, Alex,
> > > I tried to test IPoIB_CM module (in connected mode, rev.
> > > 1797) and found a problem when sending large ping.
> > > Settings: MTU_SIZE == 32K (32768), Connected Mode: On, ping
> > size: 32K
> > > Below is the description of the problem and some investigations.
> > > Please, let us know your suggestions and comments regards
> > > this issue; meanwhile, we continue to debug.
> > >
> > >
> > > This ping always fails and almost always leads both
> > machines to crash.
> > > 1. I enlarged the size of _ipoib_pkt!data to be
> > > MAX_CM_PAYLOAD_MTU just to avoid possible memory corruption
> > > on receive side during the debug.
> > > I found that this struct also used during CM operations and
> > > theoretically can receive buffers larger than 4K (Max MTU
> > size for UD)
> > >
> > > typedef struct _ipoib_pkt
> > > {
> > > ipoib_hdr_t hdr;
> > > union _payload
> > > {
> > > - uint8_t data[MAX_UD_PAYLOAD_MTU];
> > > + uint8_t data[MAX_CM_PAYLOAD_MTU];
> > > ipoib_arp_pkt_t arp;
> > > ip_pkt_t ip;
> > >
> > > } PACK_SUFFIX type;
> > >
> > > } PACK_SUFFIX ipoib_pkt_t;
> > >
> > > 2. Now the crash was not reproduced, but ping still didn't work.
> > > That's because MAX_WRS_PER_MSG was defined as
> > > MAX_CM_PAYLOAD_MTU/MAX_UD_PAYLOAD_MTU.
> > > In our case, MAX_WRS_PER_MSG was equal to 16 (62520/4092).
> > > In the case of ICMP packet IPoIB_CM still uses the UD mode;
> > > i.e. it tries to fragment the message into chunks of
> 'payload_mtu'.
> > > In our case, params.payload_mtu == 2K. That is, Ping of 32K
> > > should be fragmented into 16 chunks.
> > >
> > > From the __send_fragments:
> > > seg_len = ( next_sge > (
> > > p_port->p_adapter->params.payload_mtu - wr_size ) )?
> > > ( p_port->p_adapter->params.payload_mtu - wr_size ) :
> next_sge;
> > >
> > > 3. But there's a following check inside __send_fragments:
> > >
> > > if( wr_idx >= ( MAX_WRS_PER_MSG - 1 ) )
> > > return NDIS_STATUS_RESOURCES;
> > >
> > > In our case, wr_idx always reach 15 (i.e. 16 elements), so
> > > this check fails.
> > >
> > > 4. And there's yet another inconsistence:
> > > Not all cards support 4K MTU (in UD mode). Originally, it was
> > > possible to enlarge MTU to 4K by using IPoIB parameters.
> > > Now consider the following code:
> > >
> > > p_adapter->params.cm_payload_mtu =
> > > min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> > > p_adapter->params.cm_xfer_block_size =
> > > p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> > > p_adapter->params.payload_mtu =
> > > min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
> > > p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> > > p_adapter->params.payload_mtu);
> > >
> > > It means that if you has card that supports 4K MTU,
> > > p_adapter->params.payload_mtu will nevertheless get only 2K
> > > value (and that's still important in UD mode)
> > >
> > > Thanks,
> > > XaleX
> > >
> > > _______________________________________________
> > ofw mailing list
> > ofw at lists.openfabrics.org
> > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> >
>
More information about the ofw
mailing list