[ofw] [PATCH 6/9] dapl-2.0: scm: cr_thread occasionally segv's when disconnecting all-to-all MPI static connections

Davis, Arlin R arlin.r.davis at intel.com
Wed May 19 11:25:22 PDT 2010


Note: there is no valid call trace for the segv in cr_thread because
the state was changed by another thread during the switch statement,
so execution jumped to an unknown location.

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x41a65940 (LWP 1328)]
0x00002b2e7d9d5134 in ?? ()

Add CM object locking around all state changes and checks. When
freeing a CM object, wake up cr_thread so it can process the
state change to DCM_FREE.

Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
 dapl/openib_scm/cm.c |   39 ++++++++++++++++++++++++++++++++-------
 1 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 4c8d4a1..975ffd5 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -436,6 +436,7 @@ void dapls_cm_free(dp_ib_cm_handle_t cm_ptr)
 	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_FREE;
 	while (cm_ptr->ref_count != 1) {
+		dapli_cm_thread_signal(cm_ptr);
 		dapl_os_unlock(&cm_ptr->lock);
 		dapl_os_sleep_usec(10000);
 		dapl_os_lock(&cm_ptr->lock);
@@ -524,7 +525,9 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 		goto bail;
 	}
 
+	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_REP_PENDING;
+	dapl_os_unlock(&cm_ptr->lock);
 
 	/* send qp info and pdata to remote peer */
 	exp = sizeof(ib_cm_msg_t) - DCM_MAX_PDATA_SIZE;
@@ -836,7 +839,10 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
 	dapl_dbg_log(DAPL_DBG_TYPE_EP, " connect_rtu: send RTU\n");
 
 	/* complete handshake after final QP state change, Just ver+op */
+	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_CONNECTED;
+	dapl_os_unlock(&cm_ptr->lock);
+
 	cm_ptr->msg.op = ntohs(DCM_RTU);
 	if (send(cm_ptr->socket, (char *)&cm_ptr->msg, 4, 0) == -1) {
 		int err = dapl_socket_errno();
@@ -914,7 +920,10 @@ bail:
 		goto ud_bail;
 #endif
 	/* close socket, and post error event */
+	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_REJECTED;
+	dapl_os_unlock(&cm_ptr->lock);
+
 	dapl_evd_connection_callback(NULL, event, cm_ptr->msg.p_data,
 				     DCM_MAX_PDATA_SIZE, ep_ptr);
 	dapli_cm_free(cm_ptr);
@@ -1093,8 +1102,9 @@ static void dapli_socket_accept_data(ib_cm_srvc_handle_t acm_ptr)
 		}
 		p_data = acm_ptr->msg.p_data;
 	}
-
+	dapl_os_lock(&acm_ptr->lock);
 	acm_ptr->state = DCM_ACCEPTING_DATA;
+	dapl_os_unlock(&acm_ptr->lock);
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM,
 		     " ACCEPT: DST %s %x lid=0x%x, qpn=0x%x, psz=%d\n",
@@ -1235,7 +1245,9 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr,
 	dapl_os_memcpy(local.resv, cm_ptr->msg.resv, 4); 
 #endif
 	cm_ptr->hca = ia_ptr->hca_ptr;
+	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_ACCEPTED;
+	dapl_os_unlock(&cm_ptr->lock);
 
 	/* Link CM to EP, already queued on work thread */
 	dapl_ep_link_cm(ep_ptr, cm_ptr);
@@ -1305,7 +1317,9 @@ static void dapli_socket_accept_rtu(dp_ib_cm_handle_t cm_ptr)
 	}
 
 	/* save state and reference to EP, queue for disc event */
+	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_CONNECTED;
+	dapl_os_unlock(&cm_ptr->lock);
 
 	/* final data exchange if remote QP state is good to go */
 	dapl_dbg_log(DAPL_DBG_TYPE_EP, " PASSIVE: connected!\n");
@@ -1368,7 +1382,10 @@ bail:
 	if (cm_ptr->msg.saddr.ib.qp_type == IBV_QPT_UD) 
 		goto ud_bail;
 #endif
+	dapl_os_lock(&cm_ptr->lock);
 	cm_ptr->state = DCM_REJECTED;
+	dapl_os_unlock(&cm_ptr->lock);
+
 	dapls_cr_callback(cm_ptr, event, NULL, 0, cm_ptr->sp);
 	dapli_cm_free(cm_ptr);
 }
@@ -1759,47 +1776,55 @@ void cr_thread(void *arg)
 				     cr->socket);
 
 			/* data on listen, qp exchange, and on disc req */
+			dapl_os_lock(&cr->lock);
 			if ((ret == DAPL_FD_READ) || 
 			    (cr->state != DCM_CONN_PENDING && ret == DAPL_FD_ERROR)) {
 				if (cr->socket != DAPL_INVALID_SOCKET) {
 					switch (cr->state) {
 					case DCM_LISTEN:
+						dapl_os_unlock(&cr->lock);
 						dapli_socket_accept(cr);
-						break;
+                                                break;
 					case DCM_ACCEPTING:
+						dapl_os_unlock(&cr->lock);
 						dapli_socket_accept_data(cr);
 						break;
 					case DCM_ACCEPTED:
+						dapl_os_unlock(&cr->lock);
 						dapli_socket_accept_rtu(cr);
 						break;
 					case DCM_REP_PENDING:
+						dapl_os_unlock(&cr->lock);
 						dapli_socket_connect_rtu(cr);
 						break;
 					case DCM_CONNECTED:
+						dapl_os_unlock(&cr->lock);
 						dapli_socket_disconnect(cr);
 						break;
 					default:
+						dapl_os_unlock(&cr->lock);
 						break;
 					}
-				}
+				} else 
+					dapl_os_unlock(&cr->lock);
+
 			/* ASYNC connections, writable, readable, error; check status */
 			} else if (ret == DAPL_FD_WRITE ||
 				   (cr->state == DCM_CONN_PENDING && 
 				    ret == DAPL_FD_ERROR)) {
-
-			        if (ret == DAPL_FD_ERROR)
-					dapl_log(DAPL_DBG_TYPE_ERR, " CONN_PENDING - FD_ERROR\n");
 				
 				opt = 0;
 				opt_len = sizeof(opt);
 				ret = getsockopt(cr->socket, SOL_SOCKET,
 						 SO_ERROR, (char *)&opt,
 						 &opt_len);
+				dapl_os_unlock(&cr->lock);
 				if (!ret && !opt)
 					dapli_socket_connected(cr, opt);
 				else
 					dapli_socket_connected(cr, opt ? opt : dapl_socket_errno());
-			} 
+			} else 
+				dapl_os_unlock(&cr->lock);
 
 			dapls_cm_release(cr); /* release ref */
 			dapl_os_lock(&hca_ptr->ib_trans.lock);
-- 
1.5.2.5




More information about the ofw mailing list