[openib-general] [PATCH] amso1100: kill memcpy4 and memcpy8
Christoph Hellwig
hch at lst.de
Thu Aug 11 08:04:15 PDT 2005
memcpy8 isn't used at all, and for memcpy4 we should just rely on the
compiler to do the right optimizations for us.
Signed-off-by: Christoph Hellwig <hch at lst.de>
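
For what it's worth, gcc will typically expand small or constant-size memcpy
calls inline using the widest moves the target supports, so the plain memcpy
the patch switches to below loses nothing over the hand-rolled loops.  A
minimal userspace sketch of the post-patch shape (copy_wr and main are
illustrative names only, not from the driver):

#include <stdio.h>
#include <string.h>

/* Same shape as the copy in qp_wr_post after this patch: just call
 * memcpy and let the compiler pick the instruction sequence for the
 * target instead of hand-rolling SSE loops.
 */
static void copy_wr(void *msg, const void *wr, size_t size)
{
        memcpy(msg, wr, size);
}

int main(void)
{
        char src[64] = "work request payload", dst[64];

        copy_wr(dst, src, sizeof(src));
        printf("%s\n", dst);
        return 0;
}
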
Index: cc_qp_common.c
===================================================================
--- cc_qp_common.c (revision 3058)
+++ cc_qp_common.c (working copy)
@@ -135,139 +135,6 @@
/*
- * Function: cc_memcpy8
- *
- * Description:
- * Just like memcpy, but does 16 and 8 bytes at a time.
- *
- * IN:
- * dest - ptr destination
- * src - ptr source
- * len - The len, in bytes
- *
- * OUT: none
- *
- * Return: none
- */
-void
-cc_memcpy8( u64 *dest, u64 *src, s32 len)
-{
-#ifdef CCDEBUG
- assert((len & 0x03) == 0);
- assert(((s32)dest & 0x03) == 0);
- assert(((s32)src & 0x03) == 0);
-#endif
-
-#if (defined(X86_32) || defined(X86_64))
-
-#define MINSIZE 16
- /* unaligned data copy, 16 bytes at a time */
- while(len >= MINSIZE) {
- /* printf("%p --> %p 16B unaligned copy,len=%d \n", src, dest,len); */
- asm volatile("movdqu 0(%1), %%xmm0\n" \
- "movdqu %%xmm0, 0(%0)\n" \
- :: "r"(dest), "r"(src) : "memory");
- src += 2;
- dest += 2;
- len -= 16;
- }
-
- /* At this point, we'll have fewer than 16 bytes left.
- * But, we only allow 8 byte copies. So, we do 8 byte copies now.
- * If our len happens to be 4 or 12, we will copy 8 or 16 bytes,
- * respectively. This is not a problem, since
- * all msg_sizes in all WR queues are padded up to 8 bytes
- * (see fw/clustercore/cc_qp.c, the function ccwr_qp_create()).
- */
- while(len >= 0) {
- /* printf("%p --> %p 8B copy,len=%d \n", src, dest,len); */
- asm volatile("movq 0(%1), %%xmm0\n" \
- "movq %%xmm0, 0(%0)\n" \
- :: "r"(dest), "r"(src) : "memory");
- src += 1;
- dest += 1;
- len -= 8;
- }
-
-#elif
- #error "You need to define your platform, or add optimized"
- #error "cc_memcpy8 support for your platform."
-
-#endif /*(defined(X86_64) || defined(X86_32)) */
-
-}
-
-/*
- * Function: memcpy4
- *
- * Description:
- * Just like memcpy, but assumes all args are 4 byte aligned already.
- *
- * IN:
- * dest - ptr destination
- * src - ptr source
- * len - The len, in bytes
- *
- * OUT: none
- *
- * Return: none
- */
-static __inline__ void
-memcpy4(u64 *dest, u64 *src, u32 len)
-{
-#ifdef __KERNEL__
- unsigned long flags;
-#endif /* #ifdef __KERNEL__ */
-
- u64 xmm_regs[16]; /* Reserve space for 8, though only use 1 now. */
-
-#ifdef CCDEBUG
- ASSERT((len & 0x03) == 0);
- ASSERT(((long)dest & 0x03) == 0);
- ASSERT(((long)src & 0x03) == 0);
-#endif
-
- /* We must save and restor xmm0.
- * Failure to do so messes up the application code.
- */
- asm volatile("movdqu %%xmm0, 0(%0)\n" :: "r"(xmm_regs) : "memory");
-
-#ifdef __KERNEL__
- /* Further, in the kernel version, we must disable local interupts.
- * This is because ISRs do not save & restore xmm0. So, if
- * we are interrupted between the first movdqu and the second,
- * then xmm0 may be modified, and we will write garbage to the adapter.
- */
- local_irq_save(flags);
-#endif /* #ifdef __KERNEL__ */
-
-#define MINSIZE 16
- /* unaligned data copy */
- while(len >= MINSIZE) {
- asm volatile("movdqu 0(%1), %%xmm0\n" \
- "movdqu %%xmm0, 0(%0)\n" \
- :: "r"(dest), "r"(src) : "memory");
- src += 2;
- dest += 2;
- len -= 16;
- }
-
-#ifdef __KERNEL__
- /* Restore interrupts and registers */
- local_irq_restore(flags);
- asm volatile("movdqu 0(%0), %%xmm0\n" :: "r"(xmm_regs) : "memory");
-#endif /* #ifdef __KERNEL__ */
-
- while (len >= 4) {
- *((u32 *)dest) = *((u32 *)src);
- dest = (u64*)((unsigned long)dest + 4);
- src = (u64*)((unsigned long)src + 4);
- len -= 4;
- }
-}
-
-
-/*
* Function: qp_wr_post
*
* Description:
@@ -308,7 +175,7 @@
/*
* Copy the wr down to the adapter
*/
- memcpy4((void *)msg, (void *)wr, size);
+ memcpy(msg, wr, size);
cc_mq_produce(q);
return CC_OK;
Index: cc_mq_common.c
===================================================================
--- cc_mq_common.c (revision 3058)
+++ cc_mq_common.c (working copy)
@@ -17,8 +17,6 @@
#include "cc_mq_common.h"
#include "cc_common.h"
-extern void cc_memcpy8(u64 *, u64 *, s32);
-
#define BUMP(q,p) (p) = ((p)+1) % (q)->q_size
#define BUMP_SHARED(q,p) (p) = cpu_to_wr16((wr16_to_cpu(p)+1) % (q)->q_size)
Index: TODO
===================================================================
--- TODO (revision 3058)
+++ TODO (working copy)
@@ -43,15 +43,6 @@
[-] cc_mq_common.c: BUMP is pretty inefficient, does a divide every time
-[-] cc_qp_common.c: cc_memcpy8 corrupts FPU state, is it really needed?
- it's never called. Why is it declared in cc_mq_common.c?
- memcpy4 similarly corrupts state. If it's fixed to save CR0 and do
- clts, is it really faster than a normal memcpy (considering it also
- disables IRQs)?
-
- This is all utterly non-portably anyway -- there needs to be a
- standard fallback for PPC64, IA64 etc.
-
[-] Why is cc_queue.h needed? What is <linux/list.h> missing?
[-] cc_types.h: get rid of NULL, TRUE, FALSE defines, cc_bool_t, etc.
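
As the TODO entry removed above notes, touching %xmm0 in kernel code without
saving FPU state corrupts whatever the interrupted context had in it.  If an
SSE fast path were ever worth keeping, the supported pattern would be to
bracket the SIMD section with kernel_fpu_begin()/kernel_fpu_end() rather than
hand-saving one register and disabling interrupts.  A rough sketch only, not
part of this patch (sse_copy is a made-up name, and the loop mirrors the code
deleted above):

#include <linux/types.h>
#include <linux/string.h>
#include <asm/i387.h>

static void sse_copy(u64 *dest, u64 *src, s32 len)
{
        kernel_fpu_begin();     /* disables preemption, saves FPU/SSE state */
        while (len >= 16) {
                asm volatile("movdqu 0(%1), %%xmm0\n"
                             "movdqu %%xmm0, 0(%0)\n"
                             :: "r"(dest), "r"(src) : "memory");
                src += 2;
                dest += 2;
                len -= 16;
        }
        kernel_fpu_end();       /* restores the TS bit, reenables preemption */

        if (len > 0)
                memcpy(dest, src, len); /* tail, outside the FPU section */
}

Even then it is doubtful the win survives the begin/end overhead for the
small work-request sizes involved, which is what makes the plain memcpy the
right call.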