[openib-general] [PATCH] amso1100: kill memcpy4 and memcpy8
Christoph Hellwig
hch at lst.de
Thu Aug 11 08:04:15 PDT 2005
memcpy8 isn't used at all, and for memcpy4 we should just rely on the
compiler to do the right optimizations for us.
Signed-off-by: Christoph Hellwig <hch at lst.de>
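
For what it's worth, gcc will typically expand small or constant-size memcpy
calls inline using the widest moves the target supports, so the plain memcpy
the patch switches to below loses nothing over the hand-rolled loops.  A
minimal userspace sketch of the post-patch shape (copy_wr and main are
illustrative names only, not from the driver):

#include <stdio.h>
#include <string.h>

/* Same shape as the copy in qp_wr_post after this patch: just call
 * memcpy and let the compiler pick the instruction sequence for the
 * target instead of hand-rolling SSE loops.
 */
static void copy_wr(void *msg, const void *wr, size_t size)
{
        memcpy(msg, wr, size);
}

int main(void)
{
        char src[64] = "work request payload", dst[64];

        copy_wr(dst, src, sizeof(src));
        printf("%s\n", dst);
        return 0;
}
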
Index: cc_qp_common.c
===================================================================
--- cc_qp_common.c (revision 3058)
+++ cc_qp_common.c (working copy)
@@ -135,139 +135,6 @@
/*
- * Function: cc_memcpy8
- *
- * Description:
- * Just like memcpy, but does 16 and 8 bytes at a time.
- *
- * IN:
- * dest - ptr destination
- * src - ptr source
- * len - The len, in bytes
- *
- * OUT: none
- *
- * Return: none
- */
-void
-cc_memcpy8( u64 *dest, u64 *src, s32 len)
-{
-#ifdef CCDEBUG
- assert((len & 0x03) == 0);
- assert(((s32)dest & 0x03) == 0);
- assert(((s32)src & 0x03) == 0);
-#endif
-
-#if (defined(X86_32) || defined(X86_64))
-
-#define MINSIZE 16
- /* unaligned data copy, 16 bytes at a time */
- while(len >= MINSIZE) {
- /* printf("%p --> %p 16B unaligned copy,len=%d \n", src, dest,len); */
- asm volatile("movdqu 0(%1), %%xmm0\n" \
- "movdqu %%xmm0, 0(%0)\n" \
- :: "r"(dest), "r"(src) : "memory");
- src += 2;
- dest += 2;
- len -= 16;
- }
-
- /* At this point, we'll have fewer than 16 bytes left.
- * But, we only allow 8 byte copies. So, we do 8 byte copies now.
- * If our len happens to be 4 or 12, we will copy 8 or 16 bytes,
- * respectively. This is not a problem, since
- * all msg_sizes in all WR queues are padded up to 8 bytes
- * (see fw/clustercore/cc_qp.c, the function ccwr_qp_create()).
- */
- while(len >= 0) {
- /* printf("%p --> %p 8B copy,len=%d \n", src, dest,len); */
- asm volatile("movq 0(%1), %%xmm0\n" \
- "movq %%xmm0, 0(%0)\n" \
- :: "r"(dest), "r"(src) : "memory");
- src += 1;
- dest += 1;
- len -= 8;
- }
-
-#elif
- #error "You need to define your platform, or add optimized"
- #error "cc_memcpy8 support for your platform."
-
-#endif /*(defined(X86_64) || defined(X86_32)) */
-
-}
-
-/*
- * Function: memcpy4
- *
- * Description:
- * Just like memcpy, but assumes all args are 4 byte aligned already.
- *
- * IN:
- * dest - ptr destination
- * src - ptr source
- * len - The len, in bytes
- *
- * OUT: none
- *
- * Return: none
- */
-static __inline__ void
-memcpy4(u64 *dest, u64 *src, u32 len)
-{
-#ifdef __KERNEL__
- unsigned long flags;
-#endif /* #ifdef __KERNEL__ */
-
- u64 xmm_regs[16]; /* Reserve space for 8, though only use 1 now. */
-
-#ifdef CCDEBUG
- ASSERT((len & 0x03) == 0);
- ASSERT(((long)dest & 0x03) == 0);
- ASSERT(((long)src & 0x03) == 0);
-#endif
-
- /* We must save and restor xmm0.
- * Failure to do so messes up the application code.
- */
- asm volatile("movdqu %%xmm0, 0(%0)\n" :: "r"(xmm_regs) : "memory");
-
-#ifdef __KERNEL__
- /* Further, in the kernel version, we must disable local interupts.
- * This is because ISRs do not save & restore xmm0. So, if
- * we are interrupted between the first movdqu and the second,
- * then xmm0 may be modified, and we will write garbage to the adapter.
- */
- local_irq_save(flags);
-#endif /* #ifdef __KERNEL__ */
-
-#define MINSIZE 16
- /* unaligned data copy */
- while(len >= MINSIZE) {
- asm volatile("movdqu 0(%1), %%xmm0\n" \
- "movdqu %%xmm0, 0(%0)\n" \
- :: "r"(dest), "r"(src) : "memory");
- src += 2;
- dest += 2;
- len -= 16;
- }
-
-#ifdef __KERNEL__
- /* Restore interrupts and registers */
- local_irq_restore(flags);
- asm volatile("movdqu 0(%0), %%xmm0\n" :: "r"(xmm_regs) : "memory");
-#endif /* #ifdef __KERNEL__ */
-
- while (len >= 4) {
- *((u32 *)dest) = *((u32 *)src);
- dest = (u64*)((unsigned long)dest + 4);
- src = (u64*)((unsigned long)src + 4);
- len -= 4;
- }
-}
-
-
-/*
* Function: qp_wr_post
*
* Description:
@@ -308,7 +175,7 @@
/*
* Copy the wr down to the adapter
*/
- memcpy4((void *)msg, (void *)wr, size);
+ memcpy(msg, wr, size);
cc_mq_produce(q);
return CC_OK;
Index: cc_mq_common.c
===================================================================
--- cc_mq_common.c (revision 3058)
+++ cc_mq_common.c (working copy)
@@ -17,8 +17,6 @@
#include "cc_mq_common.h"
#include "cc_common.h"
-extern void cc_memcpy8(u64 *, u64 *, s32);
-
#define BUMP(q,p) (p) = ((p)+1) % (q)->q_size
#define BUMP_SHARED(q,p) (p) = cpu_to_wr16((wr16_to_cpu(p)+1) % (q)->q_size)
Index: TODO
===================================================================
--- TODO (revision 3058)
+++ TODO (working copy)
@@ -43,15 +43,6 @@
[-] cc_mq_common.c: BUMP is pretty inefficient, does a divide every time
-[-] cc_qp_common.c: cc_memcpy8 corrupts FPU state, is it really needed?
- it's never called. Why is it declared in cc_mq_common.c?
- memcpy4 similarly corrupts state. If it's fixed to save CR0 and do
- clts, is it really faster than a normal memcpy (considering it also
- disables IRQs)?
-
- This is all utterly non-portably anyway -- there needs to be a
- standard fallback for PPC64, IA64 etc.
-
[-] Why is cc_queue.h needed? What is <linux/list.h> missing?
[-] cc_types.h: get rid of NULL, TRUE, FALSE defines, cc_bool_t, etc.
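
As the TODO entry removed above notes, touching %xmm0 in kernel code without
saving FPU state corrupts whatever the interrupted context had in it.  If an
SSE fast path were ever worth keeping, the supported pattern would be to
bracket the SIMD section with kernel_fpu_begin()/kernel_fpu_end() rather than
hand-saving one register and disabling interrupts.  A rough sketch only, not
part of this patch (sse_copy is a made-up name, and the loop mirrors the code
deleted above):

#include <linux/types.h>
#include <linux/string.h>
#include <asm/i387.h>

static void sse_copy(u64 *dest, u64 *src, s32 len)
{
        kernel_fpu_begin();     /* disables preemption, saves FPU/SSE state */
        while (len >= 16) {
                asm volatile("movdqu 0(%1), %%xmm0\n"
                             "movdqu %%xmm0, 0(%0)\n"
                             :: "r"(dest), "r"(src) : "memory");
                src += 2;
                dest += 2;
                len -= 16;
        }
        kernel_fpu_end();       /* restores the TS bit, reenables preemption */

        if (len > 0)
                memcpy(dest, src, len); /* tail, outside the FPU section */
}

Even then it is doubtful the win survives the begin/end overhead for the
small work-request sizes involved, which is what makes the plain memcpy the
right call.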