[ofw] [IPoIB_CM] Problems with large pings

Alex Estrin alex.estrin at qlogic.com
Mon Jan 12 12:10:36 PST 2009


Hello,

> I would like to propose 2 solutions:
> 1.  using 2 different parameters for UD and CM, or

Yes, we can consider this variant,
although I was trying to keep the number of user-specified parameters to a minimum, at least for now.

> 2. If CM is disabled, ud payload will take the value from the registry;
> otherwise, cm payload will take the value from the registry and UD will
> take the maximum possible value
The UD payload parameter won't be changed unless CM is enabled.
Please note that all adjustments are guarded by the 'cm_enabled' flag check:
	if( p_port->p_adapter->params.cm_enabled )
	{
		....
	}
If the flag is not set, the UD payload will be used as specified in the registry and the interface will keep running in UD mode.
At least, that was my intention. :)
However, I just noticed that we still have to cap the upper limit of the UD MTU when CM is disabled.
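Just to make the intended sizing explicit, here is a small standalone sketch (user-space, with assumed constants for illustration only; the real values come from the ipoib headers and the registry):

#include <stdio.h>
#include <stdint.h>

/* Assumed constants for illustration only -- see the ipoib headers
 * for the real definitions. */
#define DEFAULT_MTU          2048
#define MAX_UD_PAYLOAD_MTU   (4096 - 4)
#define MAX_CM_PAYLOAD_MTU   65520
#define IPOIB_HDR_SIZE       4
#define ETH_HDR_SIZE         14

#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))

int main(void)
{
        uint32_t registry_mtu = 65520;  /* payload_mtu as read from the registry */
        uint32_t port_mtu = 4096;       /* active port MTU from the CA query */
        int cm_enabled = 1;

        if( !cm_enabled )
        {
                /* UD only: honor the registry value, capped at the UD maximum. */
                printf( "UD payload_mtu = %u\n",
                        MIN( MAX_UD_PAYLOAD_MTU, registry_mtu ) );
        }
        else
        {
                /* CM: the registry value sizes the CM payload, while the UD
                 * payload follows the actual port MTU. */
                uint32_t cm_payload = MIN( MAX_CM_PAYLOAD_MTU, registry_mtu );
                uint32_t ud_payload = MAX( DEFAULT_MTU, port_mtu ) - IPOIB_HDR_SIZE;
                printf( "CM payload_mtu = %u, cm_xfer_block_size = %u\n",
                        cm_payload, cm_payload + ETH_HDR_SIZE );
                printf( "UD payload_mtu = %u\n", ud_payload );
        }
        return 0;
}
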
So here is a new version of the patch.

Thanks,
Alex.

Index: kernel/ipoib_driver.c
===================================================================
--- kernel/ipoib_driver.c       (revision 1777)
+++ kernel/ipoib_driver.c       (working copy)
@@ -614,13 +614,13 @@
                                EVENT_IPOIB_CONNECTED_MODE_ERR, 1, 0xbadc0de0 );
                }
        }
-       p_adapter->params.cm_payload_mtu =
-                       min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
-       p_adapter->params.cm_xfer_block_size =
-                       p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
-       p_adapter->params.payload_mtu =
-                       min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
-       p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) + p_adapter->params.payload_mtu);
+       /* adjust ud mtu upper limit if cm is disabled */
+       if( !p_adapter->params.cm_enabled )
+       {
+               p_adapter->params.payload_mtu =
+                       min( MAX_UD_PAYLOAD_MTU, p_adapter->params.payload_mtu );
+       }
+
        NdisReadNetworkAddress( &status, p_mac, p_len, h_config );


@@ -735,13 +735,10 @@
        IN                              NDIS_HANDLE                                     h_adapter,
        IN                              NDIS_HANDLE                                     wrapper_config_context )
 {
-       NDIS_STATUS                     status;
        ib_api_status_t         ib_status;
        UINT                            medium_index;
        ipoib_adapter_t         *p_adapter;
-#if IPOIB_USE_DMA
-       ULONG max_phys_mapping;
-#endif
+
        IPOIB_ENTER( IPOIB_DBG_INIT );

 #ifdef _DEBUG_
@@ -783,23 +780,6 @@
                NDIS_ATTRIBUTE_USES_SAFE_BUFFER_APIS,
                NdisInterfacePNPBus );

-#if IPOIB_USE_DMA
-       max_phys_mapping = p_adapter->params.cm_enabled ?
-               p_adapter->params.cm_xfer_block_size: p_adapter->params.xfer_block_size;
-       max_phys_mapping = p_adapter->params.lso ?
-               max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping): max_phys_mapping;
-       status =
-               NdisMInitializeScatterGatherDma( h_adapter, TRUE, max_phys_mapping );
-
-       if( status != NDIS_STATUS_SUCCESS )
-       {
-               ipoib_destroy_adapter( p_adapter );
-               IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-                       ("NdisMInitializeScatterGatherDma returned 0x%.8x.\n", status) );
-               return status;
-       }
-#endif
-
        /* Create the adapter adapter */
        ib_status = ipoib_start_adapter( p_adapter );
        if( ib_status != IB_SUCCESS )
@@ -815,7 +795,7 @@
        ipoib_ref_ibat();

        IPOIB_EXIT( IPOIB_DBG_INIT );
-       return status;
+       return NDIS_STATUS_SUCCESS;
 }


Index: kernel/ipoib_port.c
===================================================================
--- kernel/ipoib_port.c (revision 1797)
+++ kernel/ipoib_port.c (working copy)
@@ -896,6 +896,47 @@
                        ("Query CA attributes failed\n" ) );
                return status;
        }
+
+       if( p_port->p_adapter->params.cm_enabled )
+       {
+               uint32_t ud_payload_mtu = __port_attr_to_mtu_size(
+                       p_port->p_ca_attrs->p_port_attr[p_port->port_num - 1].mtu );
+               /* setup CM payload mtu */
+               p_port->p_adapter->params.cm_payload_mtu =
+                       min( MAX_CM_PAYLOAD_MTU, p_port->p_adapter->params.payload_mtu );
+               p_port->p_adapter->params.cm_xfer_block_size =
+                       p_port->p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
+
+               /* adjust ipoib UD payload MTU to actual port MTU size. */
+               /* TODO: should initialization fail if port mtu < 2048 ? */
+               p_port->p_adapter->params.payload_mtu =
+                       ( max( DEFAULT_MTU, ud_payload_mtu ) -  sizeof(ipoib_hdr_t) );
+               p_port->p_adapter->params.xfer_block_size =
+                       (sizeof(eth_hdr_t) + p_port->p_adapter->params.payload_mtu);
+       }
+#if IPOIB_USE_DMA
+       /* init DMA only once while running MiniportInitialize */
+       if ( !p_port->p_adapter->reset )
+       {
+               ULONG max_phys_mapping;
+
+               max_phys_mapping = p_port->p_adapter->params.cm_enabled ?
+                       p_port->p_adapter->params.cm_xfer_block_size :
+                       p_port->p_adapter->params.xfer_block_size;
+               max_phys_mapping = p_port->p_adapter->params.lso ?
+                       max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping): max_phys_mapping;
+               status =
+                       NdisMInitializeScatterGatherDma(
+                       p_port->p_adapter->h_adapter, TRUE, max_phys_mapping );
+               if( status != NDIS_STATUS_SUCCESS )
+               {
+                       IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                               ("NdisMInitializeScatterGatherDma returned 0x%.8x.\n", status) );
+                       return status;
+               }
+       }
+#endif
+
        /* Allocate the PD. */
        status = p_port->p_adapter->p_ifc->alloc_pd(
                p_port->ib_mgr.h_ca, IB_PDT_UD, p_port, &p_port->ib_mgr.h_pd );
@@ -5268,7 +5309,7 @@

        IPOIB_ENTER( IPOIB_DBG_ENDPT );

-       /* This function must be called from the recieve path */
+       /* This function must be called from the receive path */
        CL_ASSERT(p_port->endpt_rdr > 0);

        cl_obj_lock( &p_port->obj );
@@ -5423,7 +5464,7 @@
        /* Set global routing information. */
        ib_path_rec_set_hop_flow_raw( p_path, hop_limit, flow_lbl, FALSE );
        p_path->tclass = p_port->ib_mgr.bcast_rec.tclass;
-
+
        cl_obj_unlock( &p_port->obj );

        IPOIB_EXIT( IPOIB_DBG_ENDPT );
Index: kernel/ipoib_port.h
===================================================================
--- kernel/ipoib_port.h (revision 1782)
+++ kernel/ipoib_port.h (working copy)
@@ -774,4 +774,22 @@
        IN                              ipoib_port_t* const             p_port,
        IN                              PVOID                                    cancel_id );

+static inline uint32_t
+__port_attr_to_mtu_size(uint32_t value)
+{
+       switch (value)
+       {
+       default:
+       case IB_MTU_LEN_2048:
+               return 2048;
+       case IB_MTU_LEN_4096:
+               return 4096;
+       case IB_MTU_LEN_1024:
+               return 1024;
+       case IB_MTU_LEN_512:
+               return  512;
+       case IB_MTU_LEN_256:
+               return  256;
+       }
+}
 #endif /* _IPOIB_PORT_H_ */
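
A quick note on __port_attr_to_mtu_size(): unknown attribute values fall through to the 2048 case, so an unexpected port attribute still yields a sane 2K UD MTU. Here is a tiny standalone check of that mapping (the enum encoding below is an assumption following the usual IBTA PortInfo values; the driver uses IB_MTU_LEN_* from ib_types.h):

#include <assert.h>
#include <stdint.h>

/* Stand-in encoding for illustration; the real IB_MTU_LEN_* values
 * live in ib_types.h. */
enum { MTU_LEN_256 = 1, MTU_LEN_512, MTU_LEN_1024, MTU_LEN_2048, MTU_LEN_4096 };

static uint32_t attr_to_mtu_size( uint32_t value )
{
        switch( value )
        {
        case MTU_LEN_4096: return 4096;
        case MTU_LEN_1024: return 1024;
        case MTU_LEN_512:  return 512;
        case MTU_LEN_256:  return 256;
        case MTU_LEN_2048:
        default:           return 2048;  /* unknown values fall back to 2K */
        }
}

int main(void)
{
        assert( attr_to_mtu_size( MTU_LEN_4096 ) == 4096 );
        assert( attr_to_mtu_size( 0xff ) == 2048 );  /* unknown -> 2K fallback */
        return 0;
}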



>
> Hello, Alex,
> Thank you for the patch, it looks good.
> I still have one remark regarding the (UD) payload:
>    If one uses IPoIB in UD mode, he will no longer be able to
> configure the MTU to a value lower than DEFAULT_MTU.
>
> I would like to propose 2 solutions:
> 1.  using 2 different parameters for UD and CM, or
> 2. If CM is disabled, ud payload will take the value from the registry;
> otherwise, cm payload will take the value from the registry and UD will
> take the maximum possible value
>
> XaleX
>
> -----Original Message-----
> From: Alex Estrin [mailto:alex.estrin at qlogic.com]
> Sent: Monday, January 12, 2009 8:25 PM
> To: Alex Naslednikov
> Cc: ofw at lists.openfabrics.org
> Subject: RE: [ofw] [IPoIB_CM] Problems with large pings
>
> Hello,
>
> > 4. And there's yet another inconsistency:
> > Not all cards support 4K MTU (in UD mode). Originally, it
> was possible
>
> > to enlarge MTU to 4K by using IPoIB parameters.
> > Now consider the following code:
> >
> >  p_adapter->params.cm_payload_mtu =
> >    min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> > p_adapter->params.cm_xfer_block_size =
> >    p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> > p_adapter->params.payload_mtu =
> >    min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
> > p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> > p_adapter->params.payload_mtu);
> >
> > It means that if you have a card that supports 4K MTU,
> > p_adapter->params.payload_mtu will nevertheless get only a 2K value
> > (and that's still important in UD mode)
>
> Here is a possible solution for this issue.
> Please review.
>
> Thanks,
> Alex.
>
> Signed-off-by: Alex Estrin <alex.estrin at qlogic.com>
> ----
> Index: kernel/ipoib_driver.c
> ===================================================================
> --- kernel/ipoib_driver.c     (revision 1777)
> +++ kernel/ipoib_driver.c     (working copy)
> @@ -614,13 +614,7 @@
>                               EVENT_IPOIB_CONNECTED_MODE_ERR, 1,
> 0xbadc0de0 );
>               }
>       }
> -     p_adapter->params.cm_payload_mtu =
> -                     min( MAX_CM_PAYLOAD_MTU,
> p_adapter->params.payload_mtu );
> -     p_adapter->params.cm_xfer_block_size =
> -                     p_adapter->params.cm_payload_mtu +
> sizeof(eth_hdr_t);
> -     p_adapter->params.payload_mtu =
> -                     min( DEFAULT_PAYLOAD_MTU,
> p_adapter->params.payload_mtu);
> -     p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> p_adapter->params.payload_mtu);
> +
>       NdisReadNetworkAddress( &status, p_mac, p_len, h_config );
>
>
> @@ -735,13 +729,10 @@
>       IN                              NDIS_HANDLE
> h_adapter,
>       IN                              NDIS_HANDLE
> wrapper_config_context )
>  {
> -     NDIS_STATUS                     status;
>       ib_api_status_t         ib_status;
>       UINT                            medium_index;
>       ipoib_adapter_t         *p_adapter;
> -#if IPOIB_USE_DMA
> -     ULONG max_phys_mapping;
> -#endif
> +
>       IPOIB_ENTER( IPOIB_DBG_INIT );
>
>  #ifdef _DEBUG_
> @@ -783,23 +774,6 @@
>               NDIS_ATTRIBUTE_USES_SAFE_BUFFER_APIS,
>               NdisInterfacePNPBus );
>
> -#if IPOIB_USE_DMA
> -     max_phys_mapping = p_adapter->params.cm_enabled ?
> -             p_adapter->params.cm_xfer_block_size:
> p_adapter->params.xfer_block_size;
> -     max_phys_mapping = p_adapter->params.lso ?
> -             max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping):
> max_phys_mapping;
> -     status =
> -             NdisMInitializeScatterGatherDma( h_adapter, TRUE,
> max_phys_mapping );
> -
> -     if( status != NDIS_STATUS_SUCCESS )
> -     {
> -             ipoib_destroy_adapter( p_adapter );
> -             IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
> -                     ("NdisMInitializeScatterGatherDma returned
> 0x%.8x.\n", status) );
> -             return status;
> -     }
> -#endif
> -
>       /* Create the adapter adapter */
>       ib_status = ipoib_start_adapter( p_adapter );
>       if( ib_status != IB_SUCCESS )
> @@ -815,7 +789,7 @@
>       ipoib_ref_ibat();
>
>       IPOIB_EXIT( IPOIB_DBG_INIT );
> -     return status;
> +     return NDIS_STATUS_SUCCESS;
>  }
>
>
> Index: kernel/ipoib_port.c
> ===================================================================
> --- kernel/ipoib_port.c       (revision 1797)
> +++ kernel/ipoib_port.c       (working copy)
> @@ -896,6 +896,47 @@
>                       ("Query CA attributes failed\n" ) );
>               return status;
>       }
> +
> +     if( p_port->p_adapter->params.cm_enabled )
> +     {
> +             uint32_t ud_payload_mtu = __port_attr_to_mtu_size(
> +                     p_port->p_ca_attrs->p_port_attr[p_port->port_num
> - 1].mtu );
> +             /* setup CM payload mtu */
> +             p_port->p_adapter->params.cm_payload_mtu =
> +                     min( MAX_CM_PAYLOAD_MTU,
> p_port->p_adapter->params.payload_mtu );
> +             p_port->p_adapter->params.cm_xfer_block_size =
> +                     p_port->p_adapter->params.cm_payload_mtu +
> sizeof(eth_hdr_t);
> +
> +             /* adjust ipoib UD payload MTU to actual port MTU size.
> */
> +             /* TODO: should initialization fail if port mtu < 2048 ?
> */
> +             p_port->p_adapter->params.payload_mtu =
> +                     ( max( DEFAULT_MTU, ud_payload_mtu ) -
> sizeof(ipoib_hdr_t) );
> +             p_port->p_adapter->params.xfer_block_size =
> +                     (sizeof(eth_hdr_t) +
> p_port->p_adapter->params.payload_mtu);
> +     }
> +#if IPOIB_USE_DMA
> +     /* init DMA only once while running MiniportInitialize */
> +     if ( !p_port->p_adapter->reset )
> +     {
> +             ULONG max_phys_mapping;
> +
> +             max_phys_mapping = p_port->p_adapter->params.cm_enabled
> ?
> +                     p_port->p_adapter->params.cm_xfer_block_size :
> +                     p_port->p_adapter->params.xfer_block_size;
> +             max_phys_mapping = p_port->p_adapter->params.lso ?
> +                     max(LARGE_SEND_OFFLOAD_SIZE, max_phys_mapping):
> max_phys_mapping;
> +             status =
> +                     NdisMInitializeScatterGatherDma(
> +                     p_port->p_adapter->h_adapter, TRUE,
> max_phys_mapping );
> +             if( status != NDIS_STATUS_SUCCESS )
> +             {
> +                     IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
> IPOIB_DBG_ERROR,
> +                             ("NdisMInitializeScatterGatherDma
> returned 0x%.8x.\n", status) );
> +                     return status;
> +             }
> +     }
> +#endif
> +
>       /* Allocate the PD. */
>       status = p_port->p_adapter->p_ifc->alloc_pd(
>               p_port->ib_mgr.h_ca, IB_PDT_UD, p_port,
> &p_port->ib_mgr.h_pd ); @@ -5268,7 +5309,7 @@
>
>       IPOIB_ENTER( IPOIB_DBG_ENDPT );
>
> -     /* This function must be called from the recieve path */
> +     /* This function must be called from the receive path */
>       CL_ASSERT(p_port->endpt_rdr > 0);
>
>       cl_obj_lock( &p_port->obj );
> @@ -5423,7 +5464,7 @@
>       /* Set global routing information. */
>       ib_path_rec_set_hop_flow_raw( p_path, hop_limit, flow_lbl, FALSE
> );
>       p_path->tclass = p_port->ib_mgr.bcast_rec.tclass;
> -
> +
>       cl_obj_unlock( &p_port->obj );
>
>       IPOIB_EXIT( IPOIB_DBG_ENDPT );
> Index: kernel/ipoib_port.h
> ===================================================================
> --- kernel/ipoib_port.h       (revision 1782)
> +++ kernel/ipoib_port.h       (working copy)
> @@ -774,4 +774,22 @@
>       IN                              ipoib_port_t* const
> p_port,
>       IN                              PVOID
> cancel_id );
>
> +static inline uint32_t
> +__port_attr_to_mtu_size(uint32_t value) {
> +     switch (value)
> +     {
> +     default:
> +     case IB_MTU_LEN_2048:
> +             return 2048;
> +     case IB_MTU_LEN_4096:
> +             return 4096;
> +     case IB_MTU_LEN_1024:
> +             return 1024;
> +     case IB_MTU_LEN_512:
> +             return  512;
> +     case IB_MTU_LEN_256:
> +             return  256;
> +     }
> +}
>  #endif       /* _IPOIB_PORT_H_ */
>
> > Hello,
> >
> > Thank you for your feedback.
> >
> > 1. This structure was used for header dereference only.
> > And CM receive buffers are allocated with size = cm_xfer_block_size.
> > Please see  __cm_recv_desc_ctor() in ipoib_endpoint.c
> > Could you please point me to the spot where memory corruption
> > is possible?
> >
> > 2-3. Yes, this is an issue I'm aware of.
> > The major problem is, I think, that the send descriptor is
> > used from the stack, but setting MAX_WRS_PER_MSG  > 16 would
> > make send descriptor too big and cause stack overflow. The
> > possible solution would be to use pool of send descriptors
> > (similar to what we use on receive side).
> >
> > 4. For 4K fabrics, setting the IPoIB parameter is sometimes not enough.
> > Legacy HCAs and switches can also coexist on the same fabric.
> > However you are right, the way payload_mtu is adjusted could
> > be done better.
> > I think the best way to handle this is to get actual port mtu
> > value from ca query during port initialization,
> > Then adjust payload_mtu parameter accordingly.
> > (also in this case NdisMInitializeScatterGatherDma should be
> > called after param is fixed).
> >
> > Thoughts?
> >
> > Thanks,
> > Alex.
> >
> > > -----Original Message-----
> > > From: Alex Naslednikov [mailto:xalex at mellanox.co.il]
> > > Sent: Thursday, January 08, 2009 11:02 AM
> > > To: Alex Estrin
> > > Cc: Tzachi Dar; Ishai Rabinovitz; ofw at lists.openfabrics.org
> > > Subject: [ofw] [IPoIB_CM] Problems with large pings
> > >
> > > Hello, Alex,
> > > I tried to test IPoIB_CM module (in connected mode, rev.
> > > 1797) and found a problem when sending large ping.
> > > Settings: MTU_SIZE == 32K (32768), Connected Mode: On, ping
> > size: 32K
> > > Below is the description of the problem and some investigations.
> > > Please, let us know your suggestions and comments regards
> > > this issue; meanwhile, we continue to debug.
> > >
> > >
> > > This ping always fails and almost always leads both
> > machines to crash.
> > > 1. I enlarged the size of _ipoib_pkt!data to be
> > > MAX_CM_PAYLOAD_MTU just to avoid possible memory corruption
> > > on receive side during the debug.
> > > I found that this struct is also used during CM operations and
> > > theoretically can receive buffers larger than 4K (max MTU
> > > size for UD)
> > >
> > > typedef struct _ipoib_pkt
> > > {
> > >  ipoib_hdr_t  hdr;
> > >  union _payload
> > >  {
> > > - uint8_t   data[MAX_UD_PAYLOAD_MTU];
> > > + uint8_t   data[MAX_CM_PAYLOAD_MTU];
> > >   ipoib_arp_pkt_t arp;
> > >   ip_pkt_t  ip;
> > >
> > >  } PACK_SUFFIX type;
> > >
> > > } PACK_SUFFIX ipoib_pkt_t;
> > >
> > > 2. Now the crash was not reproduced, but ping still didn't work.
> > > That's because MAX_WRS_PER_MSG was defined as
> > > MAX_CM_PAYLOAD_MTU/MAX_UD_PAYLOAD_MTU.
> > > In our case, MAX_WRS_PER_MSG  was equal to 16 (62520/4092).
> > > In the case of ICMP packet IPoIB_CM still uses the UD mode;
> > > i.e. it tries to fragment the message into chunks of
> 'payload_mtu'.
> > > In our case, params.payload_mtu == 2K. That is, Ping of 32K
> > > should be fragmented into 16 chunks.
> > >
> > > From the __send_fragments:
> > > seg_len = ( next_sge > (
> > > p_port->p_adapter->params.payload_mtu - wr_size ) )?
> > >     ( p_port->p_adapter->params.payload_mtu - wr_size ) :
> next_sge;
> > >
> > > 3. But there's a following check inside __send_fragments:
> > >
> > >   if( wr_idx >= ( MAX_WRS_PER_MSG - 1 ) )
> > >    return NDIS_STATUS_RESOURCES;
> > >
> > > In our case, wr_idx always reaches 15 (i.e. 16 elements), so
> > > this check fails.
> > >
> > > 4. And there's yet another inconsistency:
> > > Not all cards support 4K MTU (in UD mode). Originally, it was
> > > possible to enlarge MTU to 4K by using IPoIB parameters.
> > > Now consider the following code:
> > >
> > >  p_adapter->params.cm_payload_mtu =
> > >    min( MAX_CM_PAYLOAD_MTU, p_adapter->params.payload_mtu );
> > >  p_adapter->params.cm_xfer_block_size =
> > >    p_adapter->params.cm_payload_mtu + sizeof(eth_hdr_t);
> > >  p_adapter->params.payload_mtu =
> > >    min( DEFAULT_PAYLOAD_MTU, p_adapter->params.payload_mtu);
> > >  p_adapter->params.xfer_block_size = (sizeof(eth_hdr_t) +
> > > p_adapter->params.payload_mtu);
> > >
> > > It means that if you have a card that supports 4K MTU,
> > > p_adapter->params.payload_mtu will nevertheless get only 2K
> > > value (and that's still important in UD mode)
> > >
> > > Thanks,
> > > XaleX
> > >
> > > _______________________________________________
> > ofw mailing list
> > ofw at lists.openfabrics.org
> > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> >
>


