[openib-general] iWARP patch to remove x86 special memcpy optimizations

Tom Tucker tom at ammasso.com
Fri Aug 12 08:23:55 PDT 2005


Thanks to Christoph...

This patch removes cc_memcpy8 and memcpy4, which were hand-optimized
with SSE instructions for writing data over the PCI bus.  We may need
to do something later to optimize performance, depending on how good
the Linux memcpy optimizations are.
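If plain memcpy ever proves too slow for posting WRs to the adapter,
one option (just a sketch, not part of this patch, and the name
cc_copy64 is made up for illustration) would be a plain C 64-bit copy
loop that leans on the existing 8-byte padding of WR message sizes
rather than on SSE registers:

/* Sketch only -- not in this patch.  Copies 8 bytes at a time and
 * assumes dest/src are 8-byte aligned and len is a multiple of 8,
 * which the WR queues already guarantee (see ccwr_qp_create()).
 */
static inline void cc_copy64(u64 *dest, const u64 *src, u32 len)
{
	while (len >= 8) {
		*dest++ = *src++;
		len -= 8;
	}
}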

Index: cc_qp_common.c
===================================================================
--- cc_qp_common.c	(revision 3073)
+++ cc_qp_common.c	(working copy)
@@ -135,139 +135,6 @@
 
 
 /*
- * Function: cc_memcpy8
- *
- * Description: 
- * Just like memcpy, but does 16 and 8 bytes at a time.
- *
- * IN: 
- * dest		- ptr destination
- * src		- ptr source
- * len		- The len, in bytes
- *
- * OUT: none
- *
- * Return: none
- */
-void
-cc_memcpy8( u64 *dest, u64 *src, s32 len)
-{
-#ifdef CCDEBUG
-	assert((len & 0x03) == 0);
-	assert(((s32)dest & 0x03) == 0);
-	assert(((s32)src & 0x03) == 0);
-#endif
-
-#if (defined(X86_32) || defined(X86_64))
-
-#define MINSIZE 16
-	/* unaligned data copy, 16 bytes at a time */
-	while(len >= MINSIZE) {
-	        /* printf("%p --> %p 16B unaligned copy,len=%d \n", src, dest,len); */
-		asm volatile("movdqu 0(%1), %%xmm0\n"		\
-			     "movdqu %%xmm0, 0(%0)\n"		\
-			     :: "r"(dest), "r"(src) : "memory");
-		src += 2;
-		dest += 2;
-		len -= 16;
-	}
-
-	/* At this point, we'll have fewer than 16 bytes left.
-	 * But, we only allow 8 byte copies.  So, we do 8 byte copies now.
-	 * If our len happens to be 4 or 12, we will copy 8 or 16 bytes,
-	 * respectively.  This is not a problem, since
-	 * all msg_sizes in all WR queues are padded up to 8 bytes
-	 * (see fw/clustercore/cc_qp.c, the function ccwr_qp_create()).
-	 */
-	while(len >= 0) {
-	        /* printf("%p --> %p 8B copy,len=%d \n", src, dest,len); */
-		asm volatile("movq 0(%1), %%xmm0\n"                   \
-			     "movq %%xmm0, 0(%0)\n"                   \
-			     :: "r"(dest), "r"(src) : "memory");
-		src += 1;
-		dest += 1;
-		len -= 8;
-	}
-
-#else
-	#error "You need to define your platform, or add optimized"
-	#error "cc_memcpy8 support for your platform."
-
-#endif /*(defined(X86_64) || defined(X86_32)) */
-
-}
-
-/*
- * Function: memcpy4
- *
- * Description: 
- * Just like memcpy, but assumes all args are 4 byte aligned already.
- *
- * IN: 
- * dest		- ptr destination
- * src		- ptr source
- * len		- The len, in bytes
- *
- * OUT: none
- *
- * Return: none
- */
-static __inline__ void
-memcpy4(u64 *dest, u64 *src, u32 len)
-{
-#ifdef __KERNEL__
-	unsigned long flags;
-#endif /* #ifdef __KERNEL__ */
-
-	u64 xmm_regs[16]; /* Reserve space for 8, though only use 1 now. */
-
-#ifdef CCDEBUG
-	ASSERT((len & 0x03) == 0);
-	ASSERT(((long)dest & 0x03) == 0);
-	ASSERT(((long)src & 0x03) == 0);
-#endif
-
-	/* We must save and restor xmm0.
-	 * Failure to do so messes up the application code.
-	 */
-	asm volatile("movdqu %%xmm0, 0(%0)\n" :: "r"(xmm_regs) : "memory");
-	
-#ifdef __KERNEL__
-	/* Further, in the kernel version, we must disable local interupts.
-	 * This is because ISRs do not save & restore xmm0.  So, if
-	 * we are interrupted between the first movdqu and the second,
-	 * then xmm0 may be modified, and we will write garbage to the adapter.
-	 */
-	local_irq_save(flags);
-#endif /* #ifdef __KERNEL__ */
-
-#define MINSIZE 16
-	/* unaligned data copy */
-	while(len >= MINSIZE) {
-		asm volatile("movdqu 0(%1), %%xmm0\n"		\
-			     "movdqu %%xmm0, 0(%0)\n"		\
-			     :: "r"(dest), "r"(src) : "memory");
-		src += 2;
-		dest += 2;
-		len -= 16;
-	}
-
-#ifdef __KERNEL__
-	/* Restore interrupts and registers */
-	local_irq_restore(flags);
-	asm volatile("movdqu 0(%0), %%xmm0\n" :: "r"(xmm_regs) : "memory");
-#endif /* #ifdef __KERNEL__ */
-
-	while (len >= 4) {
-		*((u32 *)dest) = *((u32 *)src);
-		dest = (u64*)((unsigned long)dest + 4);
-		src = (u64*)((unsigned long)src + 4);
-		len -= 4;
-	}
-}
-
-
-/*
  * Function: qp_wr_post
  *
  * Description: 
@@ -308,7 +175,7 @@
 	/*
 	 * Copy the wr down to the adapter
 	 */
-	memcpy4((void *)msg, (void *)wr, size);
+	memcpy((void *)msg, (void *)wr, size);
 
 	cc_mq_produce(q);
 	return CC_OK;
Index: cc_mq_common.c
===================================================================
--- cc_mq_common.c	(revision 3073)
+++ cc_mq_common.c	(working copy)
@@ -17,8 +17,6 @@
 #include "cc_mq_common.h"
 #include "cc_common.h"
 
-extern void cc_memcpy8(u64 *, u64 *, s32);
-
 #define BUMP(q,p)         (p) = ((p)+1) % (q)->q_size
 #define BUMP_SHARED(q,p)  (p) = cpu_to_be16((be16_to_cpu(p)+1) % (q)->q_size)
 