[openib-general] [PATCH 1 of 2] Add memcpy_cachebypass, a memcpy that tries to reduce cache pressure
Bryan O'Sullivan
bos at pathscale.com
Wed Nov 29 14:35:08 PST 2006
This copy routine is memcpy-compatible, but on some architectures will use
cache-bypassing loads to avoid bringing the source data into the cache.
One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any
work with it. Since the source data is read-once, write-never from the
CPU's perspective, caching the data at those addresses can only evict
potentially useful data.
We provide an x86_64 implementation that uses SSE non-temporal loads,
and a generic version that falls back to plain memcpy.
Implementors for other arches should not use cache-bypassing stores to
the destination, as in most cases, the destination is accessed almost
immediately after a copy finishes.
diff -r c76ed2f1387b -r 3300b7b66f46 arch/x86_64/lib/Makefile
--- a/arch/x86_64/lib/Makefile Wed Nov 29 13:28:14 2006 +0800
+++ b/arch/x86_64/lib/Makefile Wed Nov 29 15:34:11 2006 -0800
@@ -9,4 +9,5 @@ lib-y := csum-partial.o csum-copy.o csum
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o \
+ memcpy_cachebypass.o
diff -r c76ed2f1387b -r 3300b7b66f46 arch/x86_64/lib/memcpy_cachebypass.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/x86_64/lib/memcpy_cachebypass.S Wed Nov 29 15:34:11 2006 -0800
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_cachebypass - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+ .text
+ .p2align 4,,15
+ /* rdi destination, rsi source, rdx count */
+ .globl memcpy_cachebypass
+ .type memcpy_cachebypass, @function
+memcpy_cachebypass:
+ movq %rdi, %rax
+.L5:
+ cmpq $15, %rdx
+ ja .L34
+.L3:
+ cmpl $8, %edx /* rdx is 0..15 */
+ jbe .L9
+.L6:
+ testb $8, %dxl /* rdx is 3,5,6,7,9..15 */
+ je .L13
+ movq (%rsi), %rcx
+ addq $8, %rsi
+ movq %rcx, (%rdi)
+ addq $8, %rdi
+.L13:
+ testb $4, %dxl
+ je .L15
+ movl (%rsi), %ecx
+ addq $4, %rsi
+ movl %ecx, (%rdi)
+ addq $4, %rdi
+.L15:
+ testb $2, %dxl
+ je .L17
+ movzwl (%rsi), %ecx
+ addq $2, %rsi
+ movw %cx, (%rdi)
+ addq $2, %rdi
+.L17:
+ testb $1, %dxl
+ je .L33
+.L1:
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+.L33:
+ ret
+.L34:
+ cmpq $63, %rdx /* rdx is > 15 */
+ ja .L64
+ movl $16, %ecx /* rdx is 16..63 */
+.L25:
+ movq 8(%rsi), %r8
+ movq (%rsi), %r9
+ addq %rcx, %rsi
+ movq %r8, 8(%rdi)
+ movq %r9, (%rdi)
+ addq %rcx, %rdi
+ subq %rcx, %rdx
+ cmpl %edx, %ecx /* is rdx >= 16? */
+ jbe .L25
+ jmp .L3 /* rdx is 0..15 */
+ .p2align 4,,7
+.L64:
+ movl $64, %ecx
+.L42:
+ prefetchnta 128(%rsi)
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq %rcx, %rdx
+ movq %r8, (%rdi)
+ movq 32(%rsi), %r8
+ movq %r9, 8(%rdi)
+ movq 40(%rsi), %r9
+ movq %r10, 16(%rdi)
+ movq 48(%rsi), %r10
+ movq %r11, 24(%rdi)
+ movq 56(%rsi), %r11
+ addq %rcx, %rsi
+ movq %r8, 32(%rdi)
+ movq %r9, 40(%rdi)
+ movq %r10, 48(%rdi)
+ movq %r11, 56(%rdi)
+ addq %rcx, %rdi
+ cmpq %rdx, %rcx /* is rdx >= 64? */
+ jbe .L42
+ sfence
+ orl %edx, %edx
+ je .L33
+ jmp .L5
+.L9:
+ jmp *.L12(,%rdx,8) /* rdx is 0..8 */
+ .section .rodata
+ .align 8
+ .align 4
+.L12:
+ .quad .L33
+ .quad .L1
+ .quad .L2
+ .quad .L6
+ .quad .L4
+ .quad .L6
+ .quad .L6
+ .quad .L6
+ .quad .L8
+ .text
+.L2:
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+ ret
+.L4:
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ ret
+.L8:
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ ret
diff -r c76ed2f1387b -r 3300b7b66f46 include/asm-x86_64/string.h
--- a/include/asm-x86_64/string.h Wed Nov 29 13:28:14 2006 +0800
+++ b/include/asm-x86_64/string.h Wed Nov 29 15:34:11 2006 -0800
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
__ret = __builtin_memcpy((dst),(src),__len); \
__ret; })
+#define __HAVE_ARCH_MEMCPY_CACHEBYPASS
+extern void *memcpy_cachebypass(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
diff -r c76ed2f1387b -r 3300b7b66f46 include/linux/string.h
--- a/include/linux/string.h Wed Nov 29 13:28:14 2006 +0800
+++ b/include/linux/string.h Wed Nov 29 15:34:11 2006 -0800
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
+#ifndef __HAVE_ARCH_MEMCPY_CACHEBYPASS
+#define memcpy_cachebypass(dest, src, count) memcpy((dest), (src), (count))
+#endif
#ifndef __HAVE_ARCH_MEMMOVE
extern void * memmove(void *,const void *,__kernel_size_t);
#endif
More information about the general
mailing list