[ofw] crash in mlx4 driver

Sean Hefty sean.hefty at intel.com
Thu Mar 12 15:16:06 PDT 2009


I hit the following crash in mlx4_hca.  (SVN version was updated last night, so
it's recent.)  My guess is that winverbs may be doing something wrong, but
here's the stack trace and corresponding source code in the mlx4 driver:

DEFAULT_BUCKET_ID:  DRIVER_FAULT

BUGCHECK_STR:  0xBE

PROCESS_NAME:  dtest2d.exe

CURRENT_IRQL:  f

TRAP_FRAME:  fffffadf8e293370 -- (.trap 0xfffffadf8e293370)
NOTE: The trap frame does not contain all registers.
Some register values may be zeroed or incorrect.
rax=fffffadf8a58ac82 rbx=fffffadf8e2934a0 rcx=0000000000000000
rdx=0000000000000000 rsi=0000000000000008 rdi=0000000000000000
rip=fffffadf8a6de8c2 rsp=fffffadf8e293508 rbp=0000000000000008
 r8=0000000000000050  r9=0000000000000000 r10=0000000000000000
r11=0000000000000000 r12=0000000000000000 r13=0000000000000000
r14=0000000000000000 r15=0000000000000000
iopl=0         nv up ei pl zr na po nc
mlx4_hca!atomic_set+0x12:
fffffadf`8a6de8c2 8908            mov     dword ptr [rax],ecx
ds:0008:fffffadf`8a58ac82=89481024
Resetting default scope

LAST_CONTROL_TRANSFER:  from fffff8000107984c to fffff80001026cf0

STACK_TEXT:  
fffffadf`8e292a18 fffff800`0107984c : 0000fadf`8c671f62 00000000`0000edde
00000000`00000000 00000000`00000000 : nt!DbgBreakPointWithStatus
fffffadf`8e292a20 fffff800`010c517e : 00000000`0ee40000 00000000`dffe0000
00000000`0ee40000 fffffadf`9a8e1840 : nt!KdCheckForDebugBreak+0xb5
fffffadf`8e292a60 fffff800`010d89eb : fffffadf`8a58ac00 fffffadf`8e293370
00000000`00000001 00000000`000000be : nt!IoWriteCrashDump+0x851
fffffadf`8e292c20 fffff800`0102e994 : fffffadf`8e293360 00000000`00000000
00000001`00000000 00000000`00000001 : nt!KeBugCheck2+0xb83
fffffadf`8e293260 fffff800`010a5c05 : 00000000`000000be fffffadf`8a58ac82
00000000`c4e84121 fffffadf`8e293370 : nt!KeBugCheckEx+0x104
fffffadf`8e2932a0 fffff800`0102d459 : 00000000`00000001 fffffadf`901ef1cc
fffffadf`8e293800 fffffa80`00100660 : nt!MmAccessFault+0x503
fffffadf`8e293370 fffffadf`8a6de8c2 : fffffadf`8a6deb4c fffffadf`8a58ac82
fffffadf`00000000 fffffadf`8e293538 : nt!KiPageFault+0x119
fffffadf`8e293508 fffffadf`8a6deb4c : fffffadf`8a58ac82 fffffadf`00000000
fffffadf`8e293538 fffff800`01288000 : mlx4_hca!atomic_set+0x12
[c:\mshefty\scm\winof\branches\winverbs\hw\mlx4\kernel\inc\l2w_atomic.h @ 17]
fffffadf`8e293510 fffffadf`8a58cf34 : fffffadf`9a77cd00 fffffadf`8e293610
fffffadf`9aa9b118 fffffadf`97ea5910 : mlx4_hca!mlnx_um_open+0x27c
[c:\mshefty\scm\winof\branches\winverbs\hw\mlx4\kernel\hca\vp.c @ 99]
fffffadf`8e293590 fffffadf`8a58d0ea : fffffadf`9aa9b0f0 20a60000`03c90200
fffffadf`8e293610 00000000`00000038 : winverbs!WvDeviceInit+0x84
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\kernel\wv_device.c @ 249]
fffffadf`8e2935e0 fffffadf`8a58b0b5 : fffffadf`9a2d90d0 00000520`6815a6e8
00000000`00000000 fffffadf`97ea5910 : winverbs!WvDeviceOpen+0x11a
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\kernel\wv_device.c @ 299]
fffffadf`8e293670 fffffadf`9051e0b9 : 00000520`66e88398 00000520`6815a6e8
00000000`00000048 00000000`00000010 : winverbs!WvIoDeviceControl+0xa5
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\kernel\wv_driver.c @ 224]
fffffadf`8e2936c0 fffffadf`9051d59e : 00000520`6815a6e8 00000520`6815a6e8
fffffadf`9991ba90 fffffadf`97ea5910 :
wdf01000!FxIoQueue::DispatchRequestToDriver+0x6d9
fffffadf`8e293760 fffffadf`9051c8b6 : fffffadf`99177c60 00000000`00000000
fffffadf`99177c00 fffffadf`97f00021 : wdf01000!FxIoQueue::DispatchEvents+0x83e
fffffadf`8e2938c0 fffffadf`90523998 : fffffadf`98791c00 fffffadf`98791cb0
00000520`66e88398 00000520`6815a6e8 : wdf01000!FxIoQueue::QueueRequest+0x4a6
fffffadf`8e293970 fffffadf`90507865 : fffffadf`96588d0e fffffadf`97ea5910
fffffadf`98791cb0 fffffadf`9a6a9480 : wdf01000!FxPkgIo::Dispatch+0x718
fffffadf`8e293a40 fffff800`0127f111 : 00000000`00000010 fffffadf`8e293cf0
00000000`00000000 fffffadf`9a65bf40 : wdf01000!FxDevice::Dispatch+0xa9
fffffadf`8e293a70 fffff800`0127ec16 : 00000000`00000000 00000000`00000341
00000000`00000000 00000000`00000000 : nt!IopXxxControlFile+0xa79
fffffadf`8e293b90 fffff800`0102e33d : 00000000`00000000 fffffadf`8e293c40
fffffadf`00000000 00000000`00000000 : nt!NtDeviceIoControlFile+0x56
fffffadf`8e293c00 00000000`77ef0a5a : 00000000`77d5effa 00000000`00000000
00000000`00000000 00000000`000af1e0 : nt!KiSystemServiceCopyEnd+0x3
00000000`000af028 00000000`77d5effa : 00000000`00000000 00000000`00000000
00000000`000af1e0 00000000`000c02c8 : ntdll!NtDeviceIoControlFile+0xa
00000000`000af030 00000000`00492548 : 00000000`000ce300 00000000`00000000
00000001`00001290 00000000`01d15eec : kernel32!DeviceIoControl+0x163
00000000`000af210 00000000`00493943 : 00000000`000ccae8 00000000`0000034c
00000000`003be00c 00000000`000af310 : winverbsd!CWVBase::WvDeviceIoControl+0x88
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\user\wv_base.cpp @ 95]
00000000`000af270 00000000`00497cc6 : 00000000`000ccae0 20a60000`03c90200
00000000`00000000 00000000`00000000 : winverbsd!CWVDevice::Open+0x363
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\user\wv_device.cpp @ 105]
00000000`000af400 00000000`00498457 : 00000000`000cc2a0 20a60000`03c90200
00000000`000af4a8 00000001`00001290 : winverbsd!CWVDevice::CreateInstance+0x96
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\user\wv_device.h @ 85]
00000000`000af450 00000000`0049824d : 00000000`000cc2a0 20a60000`03c90200
00000000`000af4a8 00002b99`00000000 : winverbsd!CWVProvider::OpenDevice+0x27
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\user\wv_provider.cpp @
181]
00000000`000af480 00000000`003f4b16 : 00000000`000cc2a0 20a60000`03c90200
00000000`000af4f0 00000000`00000000 : winverbsd!CWVProvider::QueryDevice+0x2d
[c:\mshefty\scm\winof\branches\winverbs\core\winverbs\user\wv_provider.cpp @
125]
00000000`000af4c0 00000000`003db693 : 00000000`00000000 00000000`003c4898
00000000`002abfe0 00000000`002e98e0 : libibverbsd!ibv_get_device_list+0x266
[c:\mshefty\scm\winof\branches\winverbs\ulp\libibverbs\src\device.cpp @ 140]
00000000`000af640 00000000`003cf1f2 : 00000000`002abfe0 00000000`002e98e0
00000001`0000b420 00000000`00000008 : dapl2_ofa_scmd!dapls_ib_open_hca+0x43
[c:\mshefty\scm\winof\branches\winverbs\ulp\dapl2\dapl\openib_scm\dapl_ib_util.c
@ 255]
00000000`000af6b0 00000000`00402dde : 00000001`0000b420 00000000`00000008
00000001`0000bc50 00000001`0000bc28 : dapl2_ofa_scmd!dapl_ia_open+0x112
[c:\mshefty\scm\winof\branches\winverbs\ulp\dapl2\dapl\common\dapl_ia_open.c @
146]
00000000`000af720 00000001`00003bb9 : 00000001`0000b420 00000000`00000008
00000001`0000bc50 00000001`0000bc28 : dat2d!dat_ia_openv+0x17e
[c:\mshefty\scm\winof\branches\winverbs\ulp\dapl2\dat\udat\udat.c @ 234]
00000000`000afb60 00000001`00009789 : 00000000`00000003 00000000`002aaea0
00000000`00000000 00000000`00000001 : dtest2d!main+0x4b9
[c:\mshefty\scm\winof\branches\winverbs\ulp\dapl2\test\dtest\dtest.c @ 342]
00000000`000aff40 00000000`77d5964c : 00000000`00000000 00000000`00000000
00000000`00000000 00000000`000affa8 : dtest2d!__mainCRTStartup+0x13d
[d:\longhorn_rc0\base\crts\crtw32\dllstuff\crtexe.c @ 716]
00000000`000aff80 00000000`00000000 : 00000001`000098f4 00000000`00000000
00000000`00000000 00000000`00000000 : kernel32!BaseProcessStart+0x29


STACK_COMMAND:  kb

FOLLOWUP_IP: 
mlx4_hca!atomic_set+12
[c:\mshefty\scm\winof\branches\winverbs\hw\mlx4\kernel\inc\l2w_atomic.h @ 17]
fffffadf`8a6de8c2 8908            mov     dword ptr [rax],ecx

FAULTING_SOURCE_CODE:  
    13: }
    14: 
    15: static inline void atomic_set(atomic_t *pval, long val)
    16: {
>   17: 	*pval = (__int32)val;
    18: }
    19: 
    20: /**
    21: * atomic_inc_and_test - decrement and test
    22: * pval: pointer of type atomic_t


SYMBOL_STACK_INDEX:  7

SYMBOL_NAME:  mlx4_hca!atomic_set+12

FOLLOWUP_NAME:  MachineOwner

MODULE_NAME: mlx4_hca

IMAGE_NAME:  mlx4_hca.sys

DEBUG_FLR_IMAGE_TIMESTAMP:  49ad96d9

FAILURE_BUCKET_ID:  X64_0xBE_mlx4_hca!atomic_set+12

BUCKET_ID:  X64_0xBE_mlx4_hca!atomic_set+12

Followup: MachineOwner
---------

**** source ****
note: p_umv_buf->command == 1

/*
 * mlnx_um_open - create a user context on the HCA and chain it to the device.
 *
 * h_ca       HCA handle; cast to mlnx_hca_t to reach the FDO and ib_device.
 * p_umv_buf  user-verbs buffer. When p_umv_buf->command is zero there is no
 *            User Verb Provider and a zeroed ib_ucontext is allocated locally;
 *            otherwise the context comes from ibv_um_open() and the buffer's
 *            p_inout_buf is filled in as a struct ibv_get_context_resp.
 * ph_um_ca   optional; receives the new user context as an ib_ca_handle_t.
 *
 * Returns IB_SUCCESS, IB_INSUFFICIENT_MEMORY (local allocation failed),
 * IB_INVALID_PARAMETER (output buffer missing or too small), or whatever
 * ibv_um_open() reports. When p_umv_buf->command is non-zero the same status
 * is also stored in p_umv_buf->status.
 */
static ib_api_status_t
mlnx_um_open(
	IN		const	ib_ca_handle_t				h_ca,
	IN	OUT			ci_umv_buf_t* const			p_umv_buf,
		OUT			ib_ca_handle_t* const		ph_um_ca
)
{
	ib_api_status_t		status;
	mlnx_hca_t			*p_hca = (mlnx_hca_t *)h_ca;
	PFDO_DEVICE_DATA p_fdo = hca2fdo(p_hca);
	struct ib_device *p_ibdev = hca2ibdev(p_hca);
	// start from NULL so a path that never assigns it cannot leave stack garbage
	struct ib_ucontext *p_uctx = NULL;
	struct ibv_get_context_resp *p_uresp;

	HCA_ENTER(HCA_DBG_SHIM);

	// sanity check
	ASSERT( p_umv_buf );
	if( !p_umv_buf->command )
	{ // no User Verb Provider
		p_uctx = cl_zalloc( sizeof(struct ib_ucontext) );
		if( !p_uctx )
		{
			status = IB_INSUFFICIENT_MEMORY;
			goto err_alloc_ucontext;
		}
		/* Copy the dev info. */
		p_uctx->device = p_ibdev;
		p_umv_buf->output_size = 0;
		status = IB_SUCCESS;
		goto done;
	}

	// sanity check
	if ( p_umv_buf->output_size < sizeof(struct ibv_get_context_resp) ||
		!p_umv_buf->p_inout_buf) {
		status = IB_INVALID_PARAMETER;
		goto err_inval_params;
	}

	status = ibv_um_open( p_ibdev, p_umv_buf, &p_uctx );
	// 'status' is an ib_api_status_t, so test it against IB_SUCCESS like the
	// rest of this function does. NT_SUCCESS() only checks for a non-negative
	// value; IB error codes are presumably non-negative enum values (confirm
	// against the ib_api_status_t definition), in which case NT_SUCCESS()
	// accepted every failure and execution fell through with p_uctx unset.
	if (status != IB_SUCCESS) {
		goto end;
	}

	// fill more parameters for user (sanity checks are in mthca_alloc_ucontext)
	p_uresp = (struct ibv_get_context_resp *)(ULONG_PTR)p_umv_buf->p_inout_buf;
	p_uresp->vend_id		 = (uint32_t)p_fdo->bus_ib_ifc.pdev->ven_id;
	p_uresp->dev_id			 = (uint16_t)p_fdo->bus_ib_ifc.pdev->dev_id;
	p_uresp->max_qp_wr		 = hca2mdev(p_hca)->caps.max_wqes;
	p_uresp->max_cqe		 = hca2mdev(p_hca)->caps.max_cqes;
	p_uresp->max_sge		 = min( hca2mdev(p_hca)->caps.max_sq_sg,
		hca2mdev(p_hca)->caps.max_rq_sg );

done:
	// fill the rest of ib_ucontext_ex fields
	// Reported crash site (with p_umv_buf->command == 1): the atomic_set()
	// below faulted; p_uctx was not NULL, but apparently invalid.
	atomic_set(&p_uctx->x.usecnt, 0);
	p_uctx->x.va = p_uctx->x.p_mdl = NULL;
	p_uctx->x.fw_if_open = FALSE;
	mutex_init( &p_uctx->x.mutex );

	// chain user context to the device
	spin_lock( &p_fdo->uctx_lock );
	cl_qlist_insert_tail( &p_fdo->uctx_list, &p_uctx->x.list_item );
	cl_atomic_inc(&p_fdo->usecnt);
	spin_unlock( &p_fdo->uctx_lock );
	
	// return the result
	if (ph_um_ca) *ph_um_ca = (ib_ca_handle_t)p_uctx;

	status = IB_SUCCESS;
	goto end;

err_inval_params:
err_alloc_ucontext:
end:
	// report the status back through the user-verbs buffer when a provider is in use
	if (p_umv_buf && p_umv_buf->command) 
		p_umv_buf->status = status;
	if (status != IB_SUCCESS) 
	{
		HCA_PRINT(TRACE_LEVEL_ERROR,HCA_DBG_SHIM,
			("completes with ERROR status %x\n", status));
	}
	HCA_EXIT(HCA_DBG_SHIM);
	return status;
}

I'll keep looking into this, but if anyone has any ideas, please let me know.
(This crash occurred after running a bunch of random
libibverbs/librdmacm/perftest tests, followed by running dtest successfully
once.  The second run of dtest generated this crash.)

- Sean




More information about the ofw mailing list