[ofw] [SRP] recovery thread
Yossi Leybovich
sleybo at dev.mellanox.co.il
Wed Feb 27 05:20:04 PST 2008
See my comments inside.
I will start to test it today
Thanks
Yossi
> -----Original Message-----
> From: ofw-bounces at lists.openfabrics.org [mailto:ofw-
> bounces at lists.openfabrics.org] On Behalf Of Alex Estrin
> Sent: Monday, February 25, 2008 4:48 PM
> To: ofw at lists.openfabrics.org
> Subject: [ofw] [SRP] recovery thread
>
> Hello,
>
> Proposed patch is an implementation of failed session recovery thread
> (one thread per session).
> Some connection functions were reworked to get more discrete control
> over session's state.
> Most of data path error states now handled.
> Also some other necessary minor changes were included:
> - fixed debug mode statistics to handle more than one session per
> IOC.
> - connection request response timeout hardcoded numbers replaced
> with calculated path roundtrip time values.
> - added limitation for pending queue growth.
> - couple minor typos.
>
> Please review.
>
> Thanks,
> Alex.
>
Snip .......
> +
> +static void
> +__srp_session_recovery(
> +IN srp_session_t* p_failed_session ,
> +IN BOOLEAN reconnect_request )
> +{
> + ib_api_status_t ib_status;
> + srp_hba_t* p_hba;
> + srp_session_t* p_new;
> + srp_session_t* p_old;
> + UCHAR target_id;
> + int retry_count;
> +
> + SRP_ENTER( SRP_DBG_SESSION );
> +
> + if( !p_failed_session )
> + return;
> + if ( ( p_hba = p_failed_session->p_hba ) == NULL )
> + return;
> +
> + p_old = p_failed_session;
> + target_id = p_old->target_id;
> + p_hba->session_list[target_id] = NULL;
> +
> + if( !reconnect_request )
> + {
> + /* we're done here */
> + SRP_PRINT( TRACE_LEVEL_WARNING, SRP_DBG_SESSION,
> + ("Session Id: %d won't recover\n",
> p_old->target_id ) );
> + cl_obj_destroy( &p_old->obj );
> + return;
> + }
> +
> + if( !p_hba->session_paused[target_id] )
> + {
> + p_hba->session_paused[target_id] = TRUE;
> +
> + StorPortDeviceBusy( p_hba->p_ext,
> + SP_UNTAGGED,
> + target_id,
> + SP_UNTAGGED,
> + (ULONG)-1 );
> + }
> +
[Yossi Leybovich]
Don't you think we need to issue StorPortDeviceBusy first (while checking
the return value and only then
Set the session to pause?
Although I don't know what we should do in case StorPortDeviceBusy return
FALSE
> + SRP_PRINT( TRACE_LEVEL_WARNING, SRP_DBG_SESSION,
> + ("Pausing Adapter Session %d for %s.\n", target_id,
> + p_hba->ioc_info.profile.id_string ) );
> +
> + cl_obj_destroy( &p_old->obj );
> +
> + for( retry_count=0; retry_count < 3 ; retry_count++ )
> + {
> + ib_status = srp_session_connect( p_hba, &p_new,
> target_id );
> +
> + if( ib_status != IB_SUCCESS )
> + {
> + SRP_PRINT( TRACE_LEVEL_WARNING, SRP_DBG_SESSION,
> + ("Failed session idx %d connect\n",
> target_id ) );
> + continue;
> + }
> +
> + p_hba->session_list[target_id] = p_new;
> + p_hba->session_paused[target_id] = FALSE;
> +
> + SRP_PRINT( TRACE_LEVEL_WARNING, SRP_DBG_SESSION,
> + ("Session idx %d connected. Resuming\n",
> target_id ) );
> +
> + StorPortDeviceReady( p_hba->p_ext,
> + SP_UNTAGGED,
> + target_id,
> + SP_UNTAGGED );
> +
[Yossi Leybovich]
Shouldn't we check the return value of StorPortDeviceReady and notify in
case of error?
> + return;
> + }
> +
> + /* what do we do now ? */
> + SRP_PRINT( TRACE_LEVEL_WARNING, SRP_DBG_SESSION,
> + ("Session idx %d recovery failed\n",
> target_id));
> +
> + return;
> +}
> +
> +void
> +srp_session_recovery_thread(
> + IN void* const context )
> +{
> + srp_session_t* p_session = (srp_session_t *)context;
> + cl_status_t status;
> +
> + if( p_session == NULL )
> + return;
> +
> + cl_event_init( &p_session->offload_event, TRUE );
> +
> + status = cl_event_wait_on( &p_session->offload_event,
> EVENT_NO_TIMEOUT, FALSE );
> +
> + __srp_session_recovery( p_session,
> !p_session->p_hba->adapter_stopped );
> +}
Snip................
> Index: srp_connection.c
> ===================================================================
> --- srp_connection.c (revision 945)
> +++ srp_connection.c (working copy)
> @@ -51,6 +51,12 @@
>
> #include <complib/cl_math.h>
>
> +#if DBG
> +
> +extern void* gp_session[SRP_MAX_SERVICE_ENTRIES];
> +
> +#endif
> +
> /* __srp_create_cqs */
> /*!
> Creates the send/recv completion queues to be used by this connection
> @@ -284,26 +290,9 @@
> srp_hba_t *p_hba = p_srp_session->p_hba;
> ib_cm_drep_t cm_drep;
> ib_api_status_t status;
> - int i;
> - int retry_count = 0;
>
> SRP_ENTER( SRP_DBG_PNP );
>
> - SRP_PRINT( TRACE_LEVEL_INFORMATION, SRP_DBG_DEBUG,
> - ("Target has issued a disconnect request.\n") );
> -
> - if ( p_hba->adapter_paused == FALSE )
> - {
> - p_hba->adapter_paused = TRUE;
> - StorPortBusy( p_hba->p_ext, (ULONG)-1 );
> - StorPortCompleteRequest( p_hba->p_ext,
> -
> SP_UNTAGGED,
> -
> SP_UNTAGGED,
> -
> SP_UNTAGGED,
> -
> SRB_STATUS_BUSY );
> - SRP_PRINT( TRACE_LEVEL_INFORMATION, SRP_DBG_DEBUG,
> ("Pausing Adapter for %s.\n", p_hba->ioc_info.profile.id_string) );
> - }
> -
> cl_obj_lock( &p_srp_session->obj );
> p_srp_session->connection.state = SRP_TARGET_DISCONNECTING;
> cl_obj_unlock( &p_srp_session->obj );
> @@ -316,92 +305,19 @@
> {
> SRP_PRINT( TRACE_LEVEL_ERROR, SRP_DBG_ERROR,
> ("Cannot respond to target disconnect request.
> Status = %d\n", status) );
> + return;
> }
>
Why the return ? I think we still need to destroy the session although we
could not send the drep.
> - cl_obj_lock( &p_hba->obj );
> + SRP_PRINT( TRACE_LEVEL_WARNING, SRP_DBG_PNP,
> + ("Target has issued a disconnect request for Session %d
> ref_cnt = %d.\n",
> +
> p_srp_session->target_id,
> +
> p_srp_session->obj.ref_cnt) );
>
> - for ( i = 0; i < SRP_MAX_SERVICE_ENTRIES; i++ )
> + if( !p_hba->adapter_stopped )
> {
> - if ( p_srp_session == p_hba->session_list[i] )
> - {
> - p_hba->session_list[i] = NULL;
> - break;
> - }
> + srp_session_failed( p_srp_session );
> }
>
> - cl_obj_unlock( &p_hba->obj );
> -
> - SRP_PRINT( TRACE_LEVEL_VERBOSE, SRP_DBG_DEBUG,
> - ("Session Object ref_cnt = %d\n",
> p_srp_session->obj.ref_cnt) );
> - cl_obj_destroy( &p_srp_session->obj );
> -
> - do
> - {
> - retry_count++;
> -
> - SRP_PRINT( TRACE_LEVEL_INFORMATION, SRP_DBG_DEBUG,
> - ("Attempting to reconnect %s. Connection Attempt
> Count = %d.\n",
> - p_hba->ioc_info.profile.id_string,
> - retry_count) );
> -
> - SRP_PRINT( TRACE_LEVEL_VERBOSE, SRP_DBG_DEBUG,
> - ("Creating New Session For Service Entry Index
> %d.\n",
> - p_hba->ioc_info.profile.num_svc_entries));
> - p_srp_session = srp_new_session(
> - p_hba, &p_hba->p_svc_entries[i], &status );
> - if ( p_srp_session == NULL )
> - {
> - status = IB_INSUFFICIENT_MEMORY;
> - break;
> - }
> -
> - SRP_PRINT( TRACE_LEVEL_VERBOSE, SRP_DBG_DEBUG,
> - ("New Session For Service Entry Index %d
> Created.\n",
> - p_hba->ioc_info.profile.num_svc_entries));
> - SRP_PRINT( TRACE_LEVEL_VERBOSE, SRP_DBG_DEBUG,
> - ("Logging Into Session.\n"));
> - status = srp_session_login( p_srp_session );
> - if ( status == IB_SUCCESS )
> - {
> - if ( p_hba->max_sg >
> p_srp_session->connection.max_scatter_gather_entries )
> - {
> - p_hba->max_sg =
> p_srp_session->connection.max_scatter_gather_entries;
> - }
> -
> - if ( p_hba->max_srb_ext_sz >
> p_srp_session->connection.init_to_targ_iu_sz )
> - {
> - p_hba->max_srb_ext_sz =
> - sizeof( srp_send_descriptor_t )
> -
> - SRP_MAX_IU_SIZE +
> -
> p_srp_session->connection.init_to_targ_iu_sz;
> - }
> -
> - cl_obj_lock( &p_hba->obj );
> - p_hba->session_list[i] = p_srp_session;
> - cl_obj_unlock( &p_hba->obj );
> -
> - SRP_PRINT( TRACE_LEVEL_VERBOSE, SRP_DBG_DEBUG,
> - ("Session Login Issued
> Successfully.\n"));
> - }
> - else
> - {
> - SRP_PRINT( TRACE_LEVEL_ERROR, SRP_DBG_ERROR,
> - ("Session Login Failure Status = %d.\n",
> status));
> - SRP_PRINT( TRACE_LEVEL_VERBOSE, SRP_DBG_DEBUG,
> - ("Session Object ref_cnt = %d\n",
> p_srp_session->obj.ref_cnt) );
> - cl_obj_destroy( &p_srp_session->obj );
> - }
> - } while ( (status != IB_SUCCESS) && (retry_count < 3) );
> -
> - if ( status == IB_SUCCESS )
> - {
> - SRP_PRINT( TRACE_LEVEL_INFORMATION, SRP_DBG_DEBUG,
> - ("Resuming Adapter for %s.\n",
> p_hba->ioc_info.profile.id_string) );
> - p_hba->adapter_paused = FALSE;
> - StorPortReady( p_hba->p_ext );
> -// StorPortNotification( BusChangeDetected, p_hba->p_ext, 0
> );
> - }
> -
> SRP_EXIT( SRP_DBG_PNP );
> }
>
> @@ -443,22 +359,26 @@
>
> p_connection->request_limit =
> MIN( get_srp_login_response_request_limit_delta(
> p_srp_login_rsp ), SRP_DEFAULT_RECV_Q_DEPTH );
> +
> + if( ib_ioc_profile_get_vend_id(
> &p_srp_session->p_hba->ioc_info.profile) == 0x00066a &&
> + cl_ntoh32(
> p_srp_session->p_hba->ioc_info.profile.subsys_id ) == 0x38 )
> + {
> + /* workaround for FVIC */
> + p_connection->request_limit /= 2;
> + }
>
> + p_connection->max_limit = p_connection->request_limit;
> p_connection->request_threashold = 2;
> #if DBG
> p_srp_session->x_req_limit = p_connection->request_limit; #endif
> - SRP_PRINT( TRACE_LEVEL_INFORMATION, SRP_DBG_DEBUG,
> - ( "request_limit_delta %d, SRP_DEFAULT_RECV_Q_DEPTH %d,
> request_threashold %d\n",
> - get_srp_login_response_request_limit_delta(
> p_srp_login_rsp ),
> - SRP_DEFAULT_RECV_Q_DEPTH,
> p_connection->request_threashold ));
>
> p_connection->send_queue_depth = p_connection->request_limit;
> p_connection->recv_queue_depth = p_connection->request_limit;
> p_connection->init_to_targ_iu_sz =
> get_srp_login_response_max_init_to_targ_iu( p_srp_login_rsp );
> p_connection->targ_to_init_iu_sz =
> get_srp_login_response_max_targ_to_init_iu( p_srp_login_rsp );
>
> - p_connection->signaled_send_completion_count =
> p_connection->send_queue_depth / 2;
> + p_connection->signaled_send_completion_count = 32;
>
Can you explain why did you change it ?
> if (( p_connection->descriptor_format &
> DBDF_INDIRECT_DATA_BUFFER_DESCRIPTORS ) ==
> DBDF_INDIRECT_DATA_BUFFER_DESCRIPTORS )
> {
> @@ -735,9 +655,9 @@
>
> cm_req.init_depth = send_msg_depth;
>
> - cm_req.remote_resp_timeout = 15;
> + cm_req.remote_resp_timeout = ib_path_rec_pkt_life(
> p_connection->p_path_rec ) + 1;
> cm_req.flow_ctrl = FALSE;
> - cm_req.local_resp_timeout = 16;
> + cm_req.local_resp_timeout = ib_path_rec_pkt_life(
> p_connection->p_path_rec ) + 1;
Its hard to decide about this timeout as its not just the round trip of the
destination but also the process time of the CM
I think we might want to have more time here.
> cm_req.retry_cnt = 1;
> cm_req.rnr_nak_timeout = 0; /* 655.36 ms */
> cm_req.rnr_retry_cnt = 6;
> @@ -900,6 +820,13 @@
> status = IB_ERROR;
> goto exit;
> }
> +
> + cl_thread_init( &p_session->recovery_thread,
> +
> (cl_pfn_thread_callback_t)srp_session_recovery_thread,
> + (void *)p_session, "srp_thread"
> );
> +#if DBG
> + gp_session[p_session->target_id] = p_session; #endif
>
> exit:
> SRP_EXIT( SRP_DBG_PNP );
> Index: srp_connection.h
> ===================================================================
> --- srp_connection.h (revision 945)
> +++ srp_connection.h (working copy)
> @@ -80,6 +80,7 @@
>
> atomic32_t tag;
> atomic32_t request_limit;
> + atomic32_t max_limit;
> int32_t request_threashold;
> uint32_t init_to_targ_iu_sz;
> uint32_t targ_to_init_iu_sz;
More information about the ofw
mailing list