欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  数据库

内存拷贝的优化方法(草稿) [2]

程序员文章站 2022-06-17 08:34:42
...

以下为引用: global_fast_memcpy9 %defineparamesp124 %definesrcparam0 %definedstparam4 %definelenparam8 %defineCACHEBLOCK400h _fast_memcpy9: pushesi pushedi pushebx movesi,[src];sourcearray movedi,[dst];destinationarray movecx,[len];numbero

以下为引用:

global _fast_memcpy9

%define param esp+12+4
%define src param+0
%define dst param+4
%define len param+8

%define CACHEBLOCK 400h

_fast_memcpy9:
push esi
push edi
push ebx

mov esi, [src] ; source array
mov edi, [dst] ; destination array
mov ecx, [len] ; number of QWORDS (8 bytes) assumes len / CACHEBLOCK is an integer
shr ecx, 3

lea esi, [esi+ecx*8] ; end of source
lea edi, [edi+ecx*8] ; end of destination
neg ecx ; use a negative offset as a combo pointer-and-loop-counter

.mainloop:
mov eax, CACHEBLOCK / 16 ; note: .prefetchloop is unrolled 2X
add ecx, CACHEBLOCK ; move up to end of block

.prefetchloop:
mov ebx, [esi+ecx*8-64] ; read one address in this cache line...
mov ebx, [esi+ecx*8-128] ; ... and one in the previous line
sub ecx, 16 ; 16 QWORDS = 2 64-byte cache lines
dec eax
jnz .prefetchloop

mov eax, CACHEBLOCK / 8

.writeloop:
prefetchnta [esi+ecx*8 + 512] ; fetch ahead by 512 bytes

movq mm0, qword [esi+ecx*8]
movq mm1, qword [esi+ecx*8+8]
movq mm2, qword [esi+ecx*8+16]
movq mm3, qword [esi+ecx*8+24]
movq mm4, qword [esi+ecx*8+32]
movq mm5, qword [esi+ecx*8+40]
movq mm6, qword [esi+ecx*8+48]
movq mm7, qword [esi+ecx*8+56]

movntq qword [edi+ecx*8], mm0
movntq qword [edi+ecx*8+8], mm1
movntq qword [edi+ecx*8+16], mm2
movntq qword [edi+ecx*8+24], mm3
movntq qword [edi+ecx*8+32], mm4
movntq qword [edi+ecx*8+40], mm5
movntq qword [edi+ecx*8+48], mm6
movntq qword [edi+ecx*8+56], mm7

add ecx, 8
dec eax
jnz .writeloop

or ecx, ecx ; assumes integer number of cacheblocks
jnz .mainloop

sfence ; flush write buffer
emms

pop ebx
pop edi
pop esi

ret