[openib-general] [PATCH 3/7] AMSO1100 Work Request Definitions REPOST

Fri Mar 24 10:15:34 PST 2006

    Sean> Does anyone know if using packed when it's not needed
    Sean> results in less efficient code?

Yes, it definitely does on some (non-mainstream) architectures.  We
talked about this before, I think...

...ah yes: http://article.gmane.org/gmane.linux.drivers.openib/8396

The assembly in that post came from compiling with no optimization, but
if anything the packed version of that code:

    struct foo { int a; };
    struct bar { int b; } __attribute__((packed));

    int c(struct foo *x) { return x->a; }
    int d(struct bar *x) { return x->b; }

looks worse with -O2.  ia64 compiled with -O2 goes from one bundle to six:

    0000000000000000 <c>:
       0:	13 40 00 40 10 10 	[MBB]       ld4 r8=[r32]
       6:	00 00 00 00 10 80 	            nop.b 0x0
       c:	08 00 84 00       	            br.ret.sptk.many b0;;

    0000000000000010 <d>:
      10:	09 70 00 40 00 21 	[MMI]       mov r14=r32
      16:	f0 10 80 00 42 00 	            adds r15=2,r32
      1c:	34 00 01 84       	            adds r32=3,r32;;
      20:	19 80 04 1c 00 14 	[MMB]       ld1 r16=[r14],1
      26:	f0 00 3c 00 20 00 	            ld1 r15=[r15]
      2c:	00 00 00 20       	            nop.b 0x0;;
      30:	09 70 00 1c 00 10 	[MMI]       ld1 r14=[r14]
      36:	80 00 80 00 20 e0 	            ld1 r8=[r32]
      3c:	f1 78 bd 53       	            shl r15=r15,16;;
      40:	01 00 00 00 01 00 	[MII]       nop.m 0x0
      46:	e0 70 dc ee 29 00 	            shl r14=r14,8
      4c:	81 38 9d 53       	            shl r8=r8,24;;
      50:	0b 70 40 1c 0e 20 	[MMI]       or r14=r16,r14;;
      56:	f0 70 3c 1c 40 00 	            or r15=r14,r15
      5c:	00 00 04 00       	            nop.i 0x0;;
      60:	11 00 00 00 01 00 	[MIB]       nop.m 0x0
      66:	80 78 20 1c 40 80 	            or r8=r15,r8
      6c:	08 00 84 00       	            br.ret.sptk.many b0;;

and sparc64 goes similarly crazy:

    0000000000000000 <c>:
       0:	81 c3 e0 08 	retl 
       4:	d0 42 00 00 	ldsw  [ %o0 ], %o0
       8:	30 68 00 06 	b,a   %xcc, 20 <d>

    0000000000000020 <d>:
      20:	c6 0a 00 00 	ldub  [ %o0 ], %g3
      24:	c2 0a 20 01 	ldub  [ %o0 + 1 ], %g1
      28:	c4 0a 20 02 	ldub  [ %o0 + 2 ], %g2
      2c:	87 28 f0 18 	sllx  %g3, 0x18, %g3
      30:	d0 0a 20 03 	ldub  [ %o0 + 3 ], %o0
      34:	83 28 70 10 	sllx  %g1, 0x10, %g1
      38:	82 10 40 03 	or  %g1, %g3, %g1
      3c:	85 28 b0 08 	sllx  %g2, 8, %g2
      40:	84 10 80 01 	or  %g2, %g1, %g2
      44:	90 12 00 02 	or  %o0, %g2, %o0
      48:	81 c3 e0 08 	retl 
      4c:	91 3a 20 00 	sra  %o0, 0, %o0
      50:	30 68 00 04 	b,a   %xcc, 60 <d+0x40>

Note that mainstream architectures that handle unaligned accesses
sanely do fine with packed, e.g. ppc64:

    0000000000000000 <.c>:
       0:	e8 63 00 02 	lwa     r3,0(r3)
       4:	4e 80 00 20 	blr

    0000000000000014 <.d>:
      14:	e8 63 00 02 	lwa     r3,0(r3)
      18:	4e 80 00 20 	blr

x86_64:

    0000000000000000 <c>:
       0:	8b 07                	mov    (%rdi),%eax
       2:	c3                   	retq   

    0000000000000010 <d>:
      10:	8b 07                	mov    (%rdi),%eax
      12:	c3                   	retq   

 - R.