内存拷贝的优化方法(草稿) [2]
以下为引用: global_fast_memcpy9 %defineparamesp124 %definesrcparam0 %definedstparam4 %definelenparam8 %defineCACHEBLOCK400h _fast_memcpy9: pushesi pushedi pushebx movesi,[src];sourcearray movedi,[dst];destinationarray movecx,[len];numbero
以下为引用:
global _fast_memcpy9%define param esp+12+4
%define src param+0
%define dst param+4
%define len param+8%define CACHEBLOCK 400h
_fast_memcpy9:
push esi
push edi
push ebxmov esi, [src] ; source array
mov edi, [dst] ; destination array
mov ecx, [len] ; number of QWORDS (8 bytes) assumes len / CACHEBLOCK is an integer
shr ecx, 3lea esi, [esi+ecx*8] ; end of source
lea edi, [edi+ecx*8] ; end of destination
neg ecx ; use a negative offset as a combo pointer-and-loop-counter.mainloop:
mov eax, CACHEBLOCK / 16 ; note: .prefetchloop is unrolled 2X
add ecx, CACHEBLOCK ; move up to end of block.prefetchloop:
mov ebx, [esi+ecx*8-64] ; read one address in this cache line...
mov ebx, [esi+ecx*8-128] ; ... and one in the previous line
sub ecx, 16 ; 16 QWORDS = 2 64-byte cache lines
dec eax
jnz .prefetchloopmov eax, CACHEBLOCK / 8
.writeloop:
prefetchnta [esi+ecx*8 + 512] ; fetch ahead by 512 bytesmovq mm0, qword [esi+ecx*8]
movq mm1, qword [esi+ecx*8+8]
movq mm2, qword [esi+ecx*8+16]
movq mm3, qword [esi+ecx*8+24]
movq mm4, qword [esi+ecx*8+32]
movq mm5, qword [esi+ecx*8+40]
movq mm6, qword [esi+ecx*8+48]
movq mm7, qword [esi+ecx*8+56]movntq qword [edi+ecx*8], mm0
movntq qword [edi+ecx*8+8], mm1
movntq qword [edi+ecx*8+16], mm2
movntq qword [edi+ecx*8+24], mm3
movntq qword [edi+ecx*8+32], mm4
movntq qword [edi+ecx*8+40], mm5
movntq qword [edi+ecx*8+48], mm6
movntq qword [edi+ecx*8+56], mm7add ecx, 8
dec eax
jnz .writeloopor ecx, ecx ; assumes integer number of cacheblocks
jnz .mainloopsfence ; flush write buffer
emmspop ebx
pop edi
pop esiret