[ofw] RE: IA64 vstate cmd execution induced system crash

Tzachi Dar tzachid at mellanox.co.il
Thu Sep 20 16:05:27 PDT 2007


Seems fine, please commit.

Thanks
Tzachi 

> -----Original Message-----
> From: Smith, Stan [mailto:stan.smith at intel.com] 
> Sent: Friday, September 21, 2007 12:03 AM
> To: Tzachi Dar
> Cc: ofw at lists.openfabrics.org; arlin.r.davis at intel.com; 
> Woodruff, Robert J
> Subject: IA64 vstate cmd execution induced system crash
> 
> Hello,
>   After poking around with windbg using a minidump file, I 
> found what the problem is.....drum roll, surprise, surprise, 
> an IA64 misaligned access.
> 
> The problem is an unaligned (0xe0000165`3721def9) local MAD 
> is passed into mthca_process_mad(): 'mthca_mad.c'. The first 
> few local MAD header references are to u8 fields, but when the 
> attr_id field (be16) is accessed @ line 229 it's unaligned - oops!
> 
> After receiving mentoring from a well qualified 
> openib-windows SW coach, the problem is best fixed in 
> gen1\trunk\inc\iba\ib_al_ioctl.h @ line #2486, the typedef 
> union _ual_local_mad_ioctl definition; see enclosed files as 
> the email tabs/spaces don't really line up well.
> 
> Testing on IA64 reveals no side effects from the patch, and 
> vstat works as expected.
> Testing on x64 & x86 reveals no changes in system behavior as 
> well as working vstat commands. 
> 
> Please advise if this patch is acceptable and I will commit.
> 
> Stan.
> 
> 
> Index: ib_al_ioctl.h
> ===================================================================
> --- ib_al_ioctl.h	(revision 825)
> +++ ib_al_ioctl.h	(working copy)
> @@ -2298,7 +2298,7 @@
>  		uint64_t					h_av;
>  		ib_mad_element_t* __ptr64		p_mad_element;
>  		uint32_t					size;
> -		void* __ptr64* __ptr64		ph_proxy;
> +		void* __ptr64* __ptr64			ph_proxy;
>  
>  	}	in;
>  	struct _ual_send_mad_ioctl_out
> @@ -2488,14 +2488,15 @@
>  	struct _ual_local_mad_ioctl_in
>  	{
>  		uint64_t				h_ca;
> +		__declspec(align(8)) uint8_t mad_in[MAD_BLOCK_SIZE];
>  		uint8_t				port_num;
> -		uint8_t				mad_in[MAD_BLOCK_SIZE];
>  
>  	}	in;
>  	struct _ual_local_mad_ioctl_out
>  	{
>  		ib_api_status_t			status;
> -		uint8_t				mad_out[MAD_BLOCK_SIZE];
> +		uint32_t				_pad; /* 8-byte alignment needed for ia64 */
> +		__declspec(align(8)) uint8_t mad_out[MAD_BLOCK_SIZE];
>  
>  	}	out;
>   
> 
> Offending call chain.
> 
> nt!KeBugCheck2+0x170
> nt!KiSystemServiceHandler+0x190
> nt!RtlpExecuteEmHandlerForException+0x50
> nt!RtlDispatchException+0x580
> nt!KiDispatchException+0x470
> nt!KiExceptionDispatch+0x190
> nt!KiGenericExceptionHandler+0x330
> mthca!mthca_process_mad(struct ib_device * ibdev = 
> 0xe0000165`3862f110, int mad_flags = 0, unsigned char 
> port_num = 0x01 '', struct _ib_wc * in_wc = 
> 0x00000000`00000000, struct _ib_grh * in_grh = 
> 0x00000000`00000000, struct ib_mad * in_mad = 
> 0xe0000165`3721def9, struct ib_mad * out_mad = 
> 0xe0000165`3721def4)+0x710 
> [d:\openib-windows-svn\769\gen1\trunk\hw\mthca\kernel\mthca_ma
> d.c @ 229]
> 
> mthca!mlnx_local_mad(struct _ib_ca * h_ca = 
> 0xe0000165`389dfd70, unsigned char port_num = 0x01 '', struct 
> _ib_av_attr * p_av_attr = 0x00000000`00000000, struct _ib_mad 
> * p_mad_in = 0xe0000165`3721def9, struct _ib_mad * p_mad_out 
> = 0xe0000165`3721def4)+0x760 
> [d:\openib-windows-svn\769\gen1\trunk\hw\mthca\kernel\hca_verb
> s.c @ 1541]
> 
> ibbus!al_local_mad(struct _ib_ca * h_ca = 
> 0xe0000165`373ecf40, unsigned char port_num = 0x01 '', struct 
> _ib_av_attr * p_src_av_attr = 0x00000000`00000000, void * 
> p_mad_in = 0xe0000165`3721def9, void * p_mad_out = 
> 0xe0000165`3721def4)+0x16c0 
> [d:\openib-windows-svn\769\gen1\trunk\core\al\al_mad.c @ 3229]
> 
> ibbus!ib_local_mad(struct _ib_ca * h_ca = 
> 0xe0000165`373ecf40, unsigned char port_num = 0x01 '', void * 
> p_mad_in = 0xe0000165`3721def9, void * p_mad_out = 
> 0xe0000165`3721def4)+0xf80 
> [d:\openib-windows-svn\769\gen1\trunk\core\al\al_mad.c @ 3188]
> 
> ibbus!proxy_local_mad(void * p_open_context = 
> 0xe0000165`38811140, struct _IRP * h_ioctl = 
> 0xe0000165`373c0cc0, unsigned int64 * p_ret_bytes = 
> 0xe0000165`24ef73a8)+0xbb0 
> [d:\openib-windows-svn\769\gen1\trunk\core\al\kernel\al_proxy_
> subnet.c @ 1077]
> 
> ibbus!subnet_ioctl(struct _IRP * h_ioctl = 
> 0xe0000165`373c0cc0, unsigned
> int64 * p_ret_bytes = 0xe0000165`24ef73a8)+0xc50 
> [d:\openib-windows-svn\769\gen1\trunk\core\al\kernel\al_proxy_
> subnet.c @ 1150]
> 
> ibbus!al_dev_ioctl(struct _IRP * h_ioctl = 
> 0xe0000165`373c0cc0)+0xcd0 
> [d:\openib-windows-svn\769\gen1\trunk\core\al\kernel\al_dev.c @ 460]
> 
> ibbus!bus_drv_ioctl(struct _DEVICE_OBJECT * p_dev_obj = 
> 0xe0000165`386d38a0, struct _IRP * p_irp = 
> 0xe0000165`373c0cc0)+0x8b0 
> [d:\openib-windows-svn\769\gen1\trunk\core\bus\kernel\bus_driv
> er.c @ 402]
> 
> nt!IofCallDriver+0x120
> nt!IopSynchronousServiceTail+0x230
> nt!IopXxxControlFile+0x1140
> nt!NtDeviceIoControlFile+0x80
> nt!KiSystemServiceExit
> 



More information about the ofw mailing list