[ofw] [PATCH 3/3] Mlx4 (checksum): improving the mechanism of csum offload
Smith, Stan
stan.smith at intel.com
Thu Sep 4 12:54:52 PDT 2008
Please see inline comments.
Thanks.
Tzachi Dar wrote:
> See bellow.
>
> Thanks
> Tzachi
>
>> -----Original Message-----
>> From: Smith, Stan [mailto:stan.smith at intel.com]
>> Sent: Thursday, September 04, 2008 10:16 PM
>> To: Alex Naslednikov; Tzachi Dar; ofw at lists.openfabrics.org
>> Subject: RE: [ofw] [PATCH 3/3] Mlx4 (checksum): improving the
>> mechanism of csum offload
>>
>> Hello Alex,
>>
>> Some questions:
>>
>> 1) Am I correct in understanding that ipoib offload is turned 'on'
>> by default?
>>
> The mode that is turned on by default, is "do checksum in HW if HW
> supports it, or by SW if HW doesn't support that". Probably the best
> mode for everyone.
>
>> 2) For debug/testing purposes, how does one turn ipoib
>> offload 'off' or 'on'?
>
> I have once sent such a script, I'll try to find it or send a new one.
>
Should this script be part of IPoIB offload documentation?
>
>>
>> 3) In the patch for ipoib_port.c, when making the call
>> query_ca(p_port->ib_mgr.h_ca, NULL, &attr_size );
>> I do not see attr_size initialized, hence it could be a very
>> large integer (stack garbage). The garbage value could pass the
>> 1st call to get the attr size, then the following
>> cl_malloc(attr_size) call could fail
>> due to the very large integer.
>> Do you agree?
>
> In the function mlnx_query_ca there is the following code:
> // resource sufficience check
> if (NULL == p_ca_attr || *p_byte_count < required_size) {
> *p_byte_count = required_size;
> status = IB_INSUFFICIENT_MEMORY;
> if ( p_ca_attr != NULL) {
> HCA_PRINT (TRACE_LEVEL_ERROR,HCA_DBG_SHIM,
> ("Failed *p_byte_count (%d) <
> required_size (%d)\n", *p_byte_count, required_size ));
> }
> goto err_insuff_mem;
> }
> Since p_ca_attr== NULL there shouldn't be a problem, the value is
> ignored.
OK, situation covered.
Thanks.
>
>
>>
>> Please notify the list when the patches are applied and committed to
>> svn.
>>
>> Thanks,
>>
>> Stan.
>>
>>
>> Alex Naslednikov wrote:
>>> This is cumulative patch that fixes all open issues shown the last
>>> week: SGE wrong calculation LSO messages in log files
>>> Fixes in ipoib csum offload mechanism
>>> Adding 3 states for CSUM offload : disabled, enabled if supported
>>> by HW and bypass (see inline)
>>>
>>> Signed-off by: xalex (Alexander Naslednikov)
>>> Index:
>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_port.c
>>> ===================================================================
>>> --- D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_port.c
>>> (revision 3106) +++
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_port.c
>>> (revision 3107) @@ -825,6 +825,8 @@ uint64_t
>>> vaddr; net32_t rkey;
>>> ib_qp_attr_t qp_attr;
>>> + ib_ca_attr_t * p_ca_attr;
>>> + uint32_t attr_size;
>>>
>>> IPOIB_ENTER( IPOIB_DBG_INIT );
>>>
>>> @@ -889,7 +891,7 @@
>>>
>> p_port->p_adapter->p_ifc->get_err_str( status
>>> )) );
>>> return status;
>>> }
>>> -
>>> +
>>> /* Allocate the QP. */
>>> cl_memclr( &qp_create, sizeof(qp_create) );
>>> qp_create.qp_type = IB_QPT_UNRELIABLE_DGRM; @@ -897,8
>>> +899,47 @@ qp_create.rq_sge = 2; /* To support buffers
>>> spanning pages. */ qp_create.h_rq_cq =
>>> p_port->ib_mgr.h_recv_cq; qp_create.sq_depth =
>>> p_port->p_adapter->params.sq_depth;
>>> - //TODO: Figure out the right number of SGE entries for
>>> sends.
>>> - qp_create.sq_sge = MAX_SEND_SGE;
>>> +
>>> + //Figure out the right number of SGE entries for sends.
>>> + /* Get the size of the CA attribute structure. */
>>> + status = p_port->p_adapter->p_ifc->query_ca(
>>> p_port->ib_mgr.h_ca, NULL, &attr_size );
>>> + if( status != IB_INSUFFICIENT_MEMORY )
>>> + {
>>> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>>> IPOIB_DBG_ERROR, + ("ib_query_ca failed with
>>> status %s.\n", p_port->p_adapter->p_ifc->get_err_str(status)) );
>>> + return status;
>>> + }
>>> +
>>> + /* Allocate enough space to store the attribute structure.
>>> */ + p_ca_attr = cl_malloc( attr_size );
>>> + if( !p_ca_attr )
>>> + {
>>> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>>> IPOIB_DBG_ERROR, + ("cl_malloc failed to
>>> allocate p_ca_attr!\n") ); + return
>>> IB_INSUFFICIENT_RESOURCES; + } +
>>> + /* Query the CA attributes. */
>>> + status =
>>> p_port->p_adapter->p_ifc->query_ca(p_port->ib_mgr.h_ca, p_ca_attr,
>>> &attr_size ); + if( status != IB_SUCCESS )
>>> + {
>>> + cl_free( p_ca_attr );
>>> +
>>> + IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>>> IPOIB_DBG_ERROR, + ("ib_query_ca failed with
>>> status %s.\n", p_port->p_adapter->p_ifc->get_err_str(status)) );
>>> + return status;
>>> + }
>>> +#define UD_QP_USED_SGE 3
>>> + qp_create.sq_sge = MAX_SEND_SGE < p_ca_attr->max_sges ?
>>> MAX_SEND_SGE : (p_ca_attr->max_sges - UD_QP_USED_SGE);
>>> + if (!p_ca_attr->ipoib_csum) {
>>> + //checksum is not supported by device
>>> + //user must specify BYPASS to explicitly cancel
>>> checksum calculation + if
>>> (p_port->p_adapter->params.send_chksum_offload == CSUM_ENABLED) +
>> p_port->p_adapter->params.send_chksum_offload
>>> = CSUM_DISABLED;
>>> + if (p_port->p_adapter->params.recv_chksum_offload
>>> == CSUM_ENABLED) +
>> p_port->p_adapter->params.recv_chksum_offload
>>> = CSUM_DISABLED;
>>> + }
>>> +
>>> qp_create.h_sq_cq = p_port->ib_mgr.h_send_cq;
>>> qp_create.sq_signaled = TRUE;
>>> status = p_port->p_adapter->p_ifc->create_qp( @@ -1937,7
>>> +1978,7 @@
>>>
>>> }
>>> /* Successful completion. Get the receive
>>> information. */ - p_desc->ndis_csum.Value = (ULONG)
>>> p_wc->csum_ok; + p_desc->ndis_csum.Value = (ULONG)
>>> p_wc->recv.ud.csum_ok; cl_perf_start( GetRecvEndpts
>>> ); __recv_get_endpts( p_port, p_desc, p_wc, &p_src,
>>> &p_dst ); cl_perf_stop( &p_port->p_adapter->perf,
>>> GetRecvEndpts ); @@ -2580,10 +2621,22 @@
>>> ("__buf_mgr_get_ndis_pkt failed\n") ); return
>>> IB_INSUFFICIENT_RESOURCES; }
>>> -
>>> + if (p_port->p_adapter->params.recv_chksum_offload !=
>>> CSUM_BYPASS) { /* Get the checksums directly from packet
>>> information. */
>>> - NDIS_PER_PACKET_INFO_FROM_PACKET( *pp_packet,
>>> TcpIpChecksumPacketInfo ) =
>>> - (PVOID) (uintn_t) (p_desc->ndis_csum.Value);
>>> + /* In this case, no one of cheksum's cat get false value */
>>> + /* If hardware checksum failed or wasn't calculated, NDIS
>>> will recalculate it again */ +
>>> NDIS_PER_PACKET_INFO_FROM_PACKET( *pp_packet,
>>> TcpIpChecksumPacketInfo ) = + (PVOID)
>>> (uintn_t) (p_desc->ndis_csum.Value); + } else { +
>>> NDIS_TCP_IP_CHECKSUM_PACKET_INFO chksum; + /*
>>> Flag the checksums as having been calculated. */ +
>>> chksum.Value = 0; +
>> chksum.Receive.NdisPacketTcpChecksumSucceeded = TRUE;
>>> +
>> chksum.Receive.NdisPacketUdpChecksumSucceeded = TRUE;
>>> + chksum.Receive.NdisPacketIpChecksumSucceeded = TRUE;
>>> + NDIS_PER_PACKET_INFO_FROM_PACKET( *pp_packet,
>>> TcpIpChecksumPacketInfo ) = +
>>> (void*)(uintn_t)chksum.Value; + }
>>> ipoib_inc_recv_stat( p_port->p_adapter, type, p_desc->len );
>>>
>>> IPOIB_EXIT( IPOIB_DBG_RECV );
>>> @@ -3833,9 +3886,9 @@
>>> p_desc->wr.dgrm.ud.hlen = lso_header_size;
>>> // Tell NDIS how much we will send.
>>> PktExt->NdisPacketInfo[TcpLargeSendPacketInfo] =
>>> UlongToPtr(PacketLength); - p_desc->wr.send_opt |=
>>> (IB_SEND_OPT_TX_IP_CSUM | IB_SEND_OPT_TX_TCP_UDP_CSUM); +
>>> p_desc->wr.send_opt |= (IB_SEND_OPT_TX_IP_CSUM |
>>> IB_SEND_OPT_TX_TCP_UDP_CSUM) |
>>> IB_SEND_OPT_SIGNALED; __send_gen(p_port, p_desc, IndexOfData); -
>>> p_desc->wr.wr_type = ( WR_LSO | IB_SEND_OPT_SIGNALED); +
>>> p_desc->wr.wr_type = WR_LSO; } else {
>>>
>>> /* Setup the first local data segment (used for the
>>> IPoIB header). */ Index:
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_adapter.h
>>> ===================================================================
>>> ---
>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_adapter.h
>>> (revision 3106) +++
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_adapter.h
>>> (revision 3107) @@ -60,15 +60,15 @@
>>> /*
>>> * Macros
>>> */
>>> +typedef enum {CSUM_DISABLED = 0, CSUM_ENABLED, CSUM_BYPASS}
>>> tCsumTypeEn;
>>>
>>> -
>>> typedef struct _ipoib_params
>>> {
>>> int32_t rq_depth;
>>> int32_t rq_low_watermark;
>>> int32_t sq_depth;
>>> - boolean_t send_chksum_offload;
>>> - boolean_t recv_chksum_offload;
>>> + int32_t send_chksum_offload; //is actually of type
>>> tCsumTypeEn + int32_t recv_chksum_offload; //is
>>> actually of type tCsumTypeEn uint32_t sa_timeout;
>>> uint32_t sa_retry_cnt;
>>> uint32_t recv_pool_ratio;
>>> @@ -91,12 +91,11 @@
>>> * Number of send WQEs to allocate.
>>> *
>>> * send_chksum_offload
>>> -* Flag to indicate whether to offload send checksums.
>>> This will make it
>>> -* so that IPoIB packets should never be forwarded out
>>> of the IB subnet
>>> -* without recalculating the checksum. -*
>>> * recv_chksum_offload
>>> -* Flag to indicate whether to offload recv checksums.
>>> +* Flags to indicate whether to offload send/recv
>>> checksums. +* 0 - No hardware cheksum
>>> +* 1 - Try to offload if the device support it
>>> +* 2 - Always report success (checksum bypass) *
>>> * wsdp_enabled
>>> * Flag to indicate whether WSDP is enabled for an
>>> adapter adapter. Index:
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_driver.c
>>> ===================================================================
>>> ---
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_driver.c
>>> (revision 3106) +++
>>>
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/ipoib_driver.c
>>> (revision 3107) @@ -150,8 +150,8 @@ {NDIS_STRING_CONST("RqDepth"),
>>> 1,
>>> IPOIB_OFFSET(rq_depth), IPOIB_SIZE(rq_depth), 512,
>>> 128, 1024}, {NDIS_STRING_CONST("RqLowWatermark"), 0,
>>> IPOIB_OFFSET(rq_low_watermark), IPOIB_SIZE(rq_low_watermark),
>>> 4, 2, 8}, {NDIS_STRING_CONST("SqDepth"), 1,
>>> IPOIB_OFFSET(sq_depth), IPOIB_SIZE(sq_depth), 512,
>>> 128, 1024}, - {NDIS_STRING_CONST("SendChksum"), 1,
>>> IPOIB_OFFSET(send_chksum_offload),
>>> IPOIB_SIZE(send_chksum_offload),0, 0, 1},
>>> - {NDIS_STRING_CONST("RecvChksum"), 1,
>>> IPOIB_OFFSET(recv_chksum_offload),
>>> IPOIB_SIZE(recv_chksum_offload),0, 0, 1},
>>> + {NDIS_STRING_CONST("SendChksum"), 1,
>>> IPOIB_OFFSET(send_chksum_offload),
>>>
>> IPOIB_SIZE(send_chksum_offload),CSUM_ENABLED,CSUM_DISABLED,CSUM_BYPASS
>>> },
>>> + {NDIS_STRING_CONST("RecvChksum"), 1,
>>> IPOIB_OFFSET(recv_chksum_offload),
>>>
>> IPOIB_SIZE(recv_chksum_offload),CSUM_ENABLED,CSUM_DISABLED,CSU
>> M_BYPASS},
>>> {NDIS_STRING_CONST("SaTimeout"), 1,
>>> IPOIB_OFFSET(sa_timeout), IPOIB_SIZE(sa_timeout),
>>> 1000, 250, UINT_MAX},
>>> {NDIS_STRING_CONST("SaRetries"), 1,
>>> IPOIB_OFFSET(sa_retry_cnt), IPOIB_SIZE(sa_retry_cnt), 10,
>>> 1, UINT_MAX}, {NDIS_STRING_CONST("RecvRatio"),
>>> 1, IPOIB_OFFSET(recv_pool_ratio),
>>> IPOIB_SIZE(recv_pool_ratio), 1, 1, 10}, @@ -1450,28 +1450,20 @@
>>> p_offload_task->TaskBufferLength =
>>> sizeof(NDIS_TASK_TCP_IP_CHECKSUM);
>>> p_offload_chksum =
>>>
>>> (NDIS_TASK_TCP_IP_CHECKSUM*)p_offload_task->TaskBuffer; - +
>>> p_offload_chksum->V4Transmit.IpOptionsSupported =
>>> - p_adapter->params.send_chksum_offload;
>>> p_offload_chksum->V4Transmit.TcpOptionsSupported =
>>> - p_adapter->params.send_chksum_offload;
>>> p_offload_chksum->V4Transmit.TcpChecksum =
>>> - p_adapter->params.send_chksum_offload;
>>> p_offload_chksum->V4Transmit.UdpChecksum =
>>> - p_adapter->params.send_chksum_offload;
>>> p_offload_chksum->V4Transmit.IpChecksum =
>>> - p_adapter->params.send_chksum_offload;
>>> + !!(p_adapter->params.send_chksum_offload);
>>>
>>> p_offload_chksum->V4Receive.IpOptionsSupported =
>>> - p_adapter->params.recv_chksum_offload;
>>> p_offload_chksum->V4Receive.TcpOptionsSupported =
>>> - p_adapter->params.recv_chksum_offload;
>>> p_offload_chksum->V4Receive.TcpChecksum =
>>> - p_adapter->params.recv_chksum_offload;
>>> p_offload_chksum->V4Receive.UdpChecksum =
>>> - p_adapter->params.recv_chksum_offload;
>>> p_offload_chksum->V4Receive.IpChecksum =
>>> - p_adapter->params.recv_chksum_offload;
>>> + !!(p_adapter->params.recv_chksum_offload);
>>>
>>> p_offload_chksum->V6Transmit.IpOptionsSupported = FALSE;
>>> p_offload_chksum->V6Transmit.TcpOptionsSupported = FALSE;
>>> Index:
>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/netipoib.inx
>>> ===================================================================
>>> --- D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/netipoib.inx
>>> (revision 3106) +++
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/ulp/ipoib/kernel/netipoib.inx
>>> (revision 3107) @@ -95,18 +95,28 @@
>>>
>>> HKR, Ndi\Params\SendChksum, ParamDesc, 0, "Send
>>> Checksum Offload" HKR, Ndi\Params\SendChksum, Type,
>>> 0, "enum" -HKR, Ndi\Params\SendChksum, Default,
>>> 0, "0" +HKR, Ndi\Params\SendChksum, Default, 0,
>>> "1" HKR, Ndi\Params\SendChksum, Optional, 0, "0"
>>> HKR, Ndi\Params\SendChksum\enum, "0", 0, "Disabled"
>>> -HKR, Ndi\Params\SendChksum\enum, "1", 0, "Enabled"
>>> +HKR, Ndi\Params\SendChksum\enum, "1", 0, "Enabled (if
>>> supported by HW)" +HKR, Ndi\Params\SendChksum\enum, "2",
>>> 0, "Bypass"
>>>
>>> HKR, Ndi\Params\RecvChksum, ParamDesc, 0, "Recv
>>> Checksum Offload" HKR, Ndi\Params\RecvChksum, Type,
>>> 0, "enum" -HKR, Ndi\Params\RecvChksum, Default,
>>> 0, "0" +HKR, Ndi\Params\RecvChksum, Default, 0,
>>> "1" HKR, Ndi\Params\RecvChksum, Optional, 0, "0"
>>> HKR, Ndi\Params\RecvChksum\enum, "0", 0, "Disabled"
>>> -HKR, Ndi\Params\RecvChksum\enum, "1", 0, "Enabled"
>>> +HKR, Ndi\Params\RecvChksum\enum, "1", 0, "Enabled (if
>>> supported by HW)" +HKR, Ndi\Params\RecvChksum\enum, "2",
>>> 0, "Bypass"
>>>
>>> +HKR, Ndi\Params\lso, ParamDesc, 0, "Large Send
>>> Offload" +HKR, Ndi\Params\lso, Type, 0, "enum"
>>> +HKR, Ndi\Params\lso, Default, 0, "0"
>>> +HKR, Ndi\Params\lso, Optional, 0, "0"
>>> +HKR, Ndi\Params\lso\enum, "0", 0, "Disabled"
>>> +HKR, Ndi\Params\lso\enum, "1", 0, "Enabled" +
>>> +
>>> HKR, Ndi\Params\SaTimeout, ParamDesc, 0, "SA
>>> Query Timeout (ms)" HKR, Ndi\Params\SaTimeout, Type,
>>> 0, "dword" HKR, Ndi\Params\SaTimeout, Default,
>>> 0, "1000" Index:
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/hw/mlx4/kernel/bus/ib/cq.c
>>> ===================================================================
>>> --- D:/Windows/MLNX_WINOF_CSUM_ARBEL/hw/mlx4/kernel/bus/ib/cq.c
>>> (revision 3106) +++
>>>
>>> D:/Windows/MLNX_WINOF_CSUM_ARBEL/hw/mlx4/kernel/bus/ib/cq.c
>>> (revision 3107) @@ -498,7 +498,7 @@
>>> wc->recv.ud.path_bits = (u8)(cqe->g_mlpath_rqpn & 0x7f);
>>> wc->recv.ud.recv_opt |=
>>> cqe->g_mlpath_rqpn & 0x080 ? IB_RECV_OPT_GRH_VALID : 0;
>>> wc->recv.ud.pkey_index = (u16)(be32_to_cpu(cqe->immed_rss_invalid)
>>> & 0x7f); - wc->csum_ok =
>>> mlx4_ib_ipoib_csum_ok(cqe->ipoib_status,cqe->checksum); +
>>> wc->recv.ud.csum_ok =
>>> mlx4_ib_ipoib_csum_ok(cqe->ipoib_status,cqe->checksum); }
>>> if (!is_send && cqe->rlid == 0){
>>>
>> MLX4_PRINT(TRACE_LEVEL_INFORMATION,MLX4_DBG_CQ,("found
>>> rlid == 0 \n "));
>>> _______________________________________________
>>> ofw mailing list
>>> ofw at lists.openfabrics.org
>>> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
More information about the ofw
mailing list