-- TOC --
测试用C代码如下:
#include <stdio.h>
/*
 * Test routine: copies data from src to dest 8 bytes at a time.
 *
 * dest and src are converted to unsigned long long and advanced by 8
 * after every 8-byte copy; left starts at len and is decremented by 8.
 *
 * NOTE: the loop runs only while left > 8 (not >= 8). For a len that is
 * a multiple of 8 the loop therefore stops with left == 8 and the final
 * 8 bytes are NOT copied (for len = 640 only 632 bytes are copied).
 * In general the trailing 1..8 bytes are never copied. The assembly
 * listings in this note reflect exactly this behavior.
 */
void simple_memcpy_test(void *dest, const void *src, size_t len){
/* work on the addresses as plain integers */
unsigned long long d = (unsigned long long)dest;
unsigned long long s = (unsigned long long)src;
/* number of bytes not yet copied */
size_t left = len;
while(left > 8){
/* copy one 8-byte word from address s to address d */
*(long long*)d = *(long long*)s;
d += 8;
s += 8;
left -= 8;
}
}
/*
 * Driver: copies array a into array b with simple_memcpy_test and
 * prints the first three characters of b.
 */
int main(){
/* a starts with the characters 1, 2, 3; the remaining elements are zero-initialized */
char a[640] = {'1','2','3'};
/* b is entirely zero-initialized */
char b[640] = {};
simple_memcpy_test(b, a, 640);
printf("%c %c %c\n", b[0], b[1], b[2]);
return 0;
}
不开优化,汇编如下:
# simple_memcpy_test(dest, src, len) compiled without optimization.
# Arguments arrive in rdi (dest), rsi (src) and rdx (len).
# Every C variable lives in a stack slot addressed relative to rbp:
#   [rbp-40] = dest, [rbp-48] = src, [rbp-56] = len,
#   [rbp-8]  = d,    [rbp-16] = s,   [rbp-24] = left.
simple_memcpy_test:
push rbp
mov rbp, rsp
# Spill the three incoming arguments to their stack slots.
mov QWORD PTR [rbp-40], rdi
mov QWORD PTR [rbp-48], rsi
mov QWORD PTR [rbp-56], rdx
# d = dest
mov rax, QWORD PTR [rbp-40]
mov QWORD PTR [rbp-8], rax
# s = src
mov rax, QWORD PTR [rbp-48]
mov QWORD PTR [rbp-16], rax
# left = len
mov rax, QWORD PTR [rbp-56]
mov QWORD PTR [rbp-24], rax
# Jump to the loop condition first (while-loop shape).
jmp .L2
.L3:
# [rbp-16] holds s, which is an address, so it is first loaded into rdx
# and then dereferenced with [rdx] to fetch the 8 bytes it points to.
mov rdx, QWORD PTR [rbp-16]
mov rax, QWORD PTR [rbp-8]
mov rdx, QWORD PTR [rdx]
# *(long long*)d = *(long long*)s
mov QWORD PTR [rax], rdx
# d += 8, s += 8, left -= 8: each update is applied directly to the
# variable's stack slot (two addresses and one byte counter).
add QWORD PTR [rbp-8], 8
add QWORD PTR [rbp-16], 8
sub QWORD PTR [rbp-24], 8
.L2:
# Compare left with 8.
cmp QWORD PTR [rbp-24], 8
# ja is the unsigned "above" test: if left > 8, goto .L3 and copy 8 more bytes.
ja .L3
# Loop finished: restore rbp and return.
nop
nop
pop rbp
ret
# Format string passed to printf by main.
.LC0:
.string "%c %c %c\n"
# main compiled without optimization.
# Frame layout: a occupies [rbp-640, rbp), b occupies [rbp-1280, rbp-640).
main:
push rbp
mov rbp, rsp
# 640*2 = 1280 bytes for the two arrays.
# main goes on to call other functions, so rsp itself is lowered
# to reserve the space.
sub rsp, 1280
# One 8-byte store initializes a[0..7]: 3355185 = 0x333231, whose low
# three bytes in little-endian order are 0x31, 0x32, 0x33 (the
# characters 1, 2, 3). a[0], a[1], a[2] are set and a[3..7] become zero.
mov QWORD PTR [rbp-640], 3355185
# a[8..15] = 0
mov QWORD PTR [rbp-632], 0
# rdx = address of a[16], the start of the part still to be zeroed
lea rdx, [rbp-624]
# Writing 0 to eax also clears the upper half, so rax = 0.
mov eax, 0
# Repeat count is 78 qwords, 78*8 = 624 bytes.
mov ecx, 78
mov rdi, rdx
# rep stosq: store rax at the address in rdi, then advance rdi by 8
# (DF=0) and decrement the count in rcx; repeat until the count is 0.
rep stosq
# Zero b the same way: two explicit qword stores for b[0..15],
# then 78 more qwords via rep stosq.
mov QWORD PTR [rbp-1280], 0
mov QWORD PTR [rbp-1272], 0
lea rdx, [rbp-1264]
mov eax, 0
mov ecx, 78
mov rdi, rdx
rep stosq
# Take the array addresses and set up the call:
# rdi = b (dest), rsi = a (src), edx = 640 (len).
lea rcx, [rbp-640]
lea rax, [rbp-1280]
mov edx, 640
mov rsi, rcx
mov rdi, rax
call simple_memcpy_test
# printf arguments: each char is loaded zero-extended into eax and then
# sign-extended from al to 32 bits. ecx = b[2], edx = b[1], esi = b[0].
movzx eax, BYTE PTR [rbp-1278]
movsx ecx, al
movzx eax, BYTE PTR [rbp-1279]
movsx edx, al
movzx eax, BYTE PTR [rbp-1280]
movsx eax, al
mov esi, eax
# edi = address of the format string
mov edi, OFFSET FLAT:.LC0
# eax = 0 before the variadic call (al carries the number of vector
# registers used for arguments; none are used here)
mov eax, 0
call printf
# return 0
mov eax, 0
leave
ret
-O1
# simple_memcpy_test at -O1: rdi = dest, rsi = src, rdx = len.
# No stack slots are used; everything stays in registers.
simple_memcpy_test:
# len <= 8: the loop body would never run, return immediately
cmp rdx, 8
jbe .L1
# rdx = (len-9)>>3; for len = 640: (640-9)>>3 = 78
sub rdx, 9
shr rdx, 3
# rcx = 8 + rdx*8 is the end offset in bytes; for len = 640 it is 79*8 = 632.
# The C source loops while left > 8, so it stops as soon as left == 8 and
# the last 8 bytes are never copied (a bug in the test code).
# That is why there are 79 copies of 8 bytes rather than 80.
lea rcx, [8+rdx*8]
# rax = 0: running byte offset
mov eax, 0
.L3:
# rax is used directly as the offset into both src (rsi) and dest (rdi)
mov rdx, QWORD PTR [rsi+rax]
mov QWORD PTR [rdi+rax], rdx
add rax, 8
# compare the running offset with the end offset; continue while they differ
cmp rax, rcx
jne .L3
.L1:
ret
# Format string passed to printf by main.
.LC0:
.string "%c %c %c\n"
# main at -O1. No frame pointer; arrays are addressed from rsp:
# b occupies [rsp, rsp+640), a occupies [rsp+640, rsp+1280).
main:
# 1280 bytes for the two arrays + 8 more.
# NOTE(review): the extra 8 presumably keeps rsp 16-byte aligned at the
# call sites (rsp is 8 off a 16-byte boundary on function entry) -- confirm.
sub rsp, 1288
# a[0..7]: 3355185 = 0x333231, i.e. bytes 0x31, 0x32, 0x33 then zeros
mov QWORD PTR [rsp+640], 3355185
# a[8..15] = 0
mov QWORD PTR [rsp+648], 0
# zero a[16..639]: 78 qwords starting at rsp+656
lea rdi, [rsp+656]
mov eax, 0
mov ecx, 78
rep stosq
# b[0..15] = 0
mov QWORD PTR [rsp], 0
mov QWORD PTR [rsp+8], 0
# zero b[16..639]; rax is still 0 from above, so only rdi and ecx are reloaded
lea rdi, [rsp+16]
mov ecx, 78
rep stosq
# arguments: edx = 640 (len), rsi = a (src), rdi = b (dest)
mov edx, 640
lea rsi, [rsp+640]
mov rdi, rsp
call simple_memcpy_test
# sign-extending byte loads straight from memory:
# ecx = b[2], edx = b[1], esi = b[0]
movsx ecx, BYTE PTR [rsp+2]
movsx edx, BYTE PTR [rsp+1]
movsx esi, BYTE PTR [rsp]
# edi = address of the format string
mov edi, OFFSET FLAT:.LC0
# eax = 0 before the variadic call (no vector registers used for arguments)
mov eax, 0
call printf
# return 0 and release the stack space
mov eax, 0
add rsp, 1288
ret
-O2
main中对simple_memcpy_test的调用被inline了(见main里的.L7循环),但copy的时候,每次还是只操作8bytes。
# simple_memcpy_test at -O2: rdi = dest, rsi = src, rdx = len.
# This standalone copy of the function is still emitted in the listing.
simple_memcpy_test:
# len <= 8: nothing to copy, return
cmp rdx, 8
jbe .L1
# rdx = (len-9)>>3, with the zeroing of the offset register placed in between
sub rdx, 9
# rax = 0: running byte offset
xor eax, eax
shr rdx, 3
# rcx = 8 + rdx*8 = end offset in bytes (632 for len = 640)
lea rcx, [8+rdx*8]
.L3:
# copy 8 bytes per iteration, using rax as the offset into src and dest
mov rdx, QWORD PTR [rsi+rax]
mov QWORD PTR [rdi+rax], rdx
add rax, 8
# continue until the running offset reaches the end offset
cmp rax, rcx
jne .L3
.L1:
ret
# Format string passed to printf by main.
.LC0:
.string "%c %c %c\n"
# main at -O2, with the copy loop inlined (no call to simple_memcpy_test).
# Layout in this listing: a occupies [rsp, rsp+640), b occupies [rsp+640, rsp+1280).
main:
sub rsp, 1288
# rax = 0: fill value for both rep stosq runs and, since rep stosq does
# not change rax, also the starting offset of the copy loop at .L7
xor eax, eax
# xmm0 = 0 (16 zero bytes)
pxor xmm0, xmm0
mov ecx, 78
lea rdi, [rsp+16]
# a[0..7]: 3355185 = 0x333231, i.e. bytes 0x31, 0x32, 0x33 then zeros
mov QWORD PTR [rsp], 3355185
# zero a[16..639]
rep stosq
lea rdi, [rsp+656]
mov ecx, 78
# a[8..15] = 0
mov QWORD PTR [rsp+8], 0
# zero b[16..639]
rep stosq
# a single 16-byte store zeroes b[0..15]
movaps XMMWORD PTR [rsp+640], xmm0
.L7:
# inlined copy loop: 8 bytes per iteration from a ([rsp+rax]) to b ([rsp+640+rax])
mov rdx, QWORD PTR [rsp+rax]
mov QWORD PTR [rsp+640+rax], rdx
add rax, 8
# stop at offset 632 = 79*8; the last 8 bytes of a are not copied
cmp rax, 632
jne .L7
# printf arguments: ecx = b[2], edx = b[1], esi = b[0], edi = format string
movsx ecx, BYTE PTR [rsp+642]
movsx edx, BYTE PTR [rsp+641]
mov edi, OFFSET FLAT:.LC0
# eax = 0 before the variadic call (no vector registers used for arguments)
xor eax, eax
movsx esi, BYTE PTR [rsp+640]
call printf
# return 0 and release the stack space
xor eax, eax
add rsp, 1288
ret
-O3
# simple_memcpy_test at -O3: rdi = dest, rsi = src, rdx = len.
# Contains two copy loops: 8 bytes per iteration (.L6) and
# 16 bytes per iteration (.L4), the latter followed by an optional
# single 8-byte copy.
simple_memcpy_test:
# keep dest in rcx; rdi is reused below as the loop end offset
mov rcx, rdi
# len <= 8: nothing to copy, return
cmp rdx, 8
jbe .L1
sub rdx, 9
cmp rdx, 23
# len-9 <= 23, i.e. a short length: goto .L3 and use the 8-byte loop
jbe .L3
# If dest == src+8, fall through to the 8-byte loop; otherwise take the
# 16-byte path at .L14.
# NOTE(review): presumably this guards the overlapping case in which each
# 8-byte store feeds the next 8-byte load, where copying 16 bytes at once
# would give a different result -- confirm.
lea rax, [rsi+8]
cmp rdi, rax
jne .L14
.L3:
# 8-byte loop: rdi = 8 + ((len-9)>>3)*8 = end offset, rax = running offset
shr rdx, 3
xor eax, eax
lea rdi, [8+rdx*8]
.L6:
mov rdx, QWORD PTR [rsi+rax]
mov QWORD PTR [rcx+rax], rdx
add rax, 8
cmp rax, rdi
jne .L6
.L1:
ret
.L14:
# rdx = ((len-9)>>3) + 1 = number of 8-byte words to copy (call it n)
shr rdx, 3
xor eax, eax
add rdx, 1
# rdi = (n>>1)*16 = number of bytes handled by the 16-byte loop
mov rdi, rdx
shr rdi
sal rdi, 4
.L4:
# movdqu and movups can operate on addresses that are not 16-byte aligned
movdqu xmm0, XMMWORD PTR [rsi+rax]
movups XMMWORD PTR [rcx+rax], xmm0
add rax, 16
cmp rax, rdi
jne .L4
# rax = (n & -2)*8 = bytes already copied; rcx = dest + that many bytes,
# rax = src + that many bytes
mov rax, rdx
and rax, -2
sal rax, 3
add rcx, rax
add rax, rsi
# n even: everything is copied, return via .L1
and edx, 1
je .L1
# n odd: copy the one remaining 8-byte word
mov rax, QWORD PTR [rax]
mov QWORD PTR [rcx], rax
ret
# Format string passed to printf by main.
.LC0:
.string "%c %c %c\n"
# main at -O3, with the copy inlined as a 16-byte-per-iteration loop.
# Layout in this listing: a occupies [rsp+16, rsp+656),
# b occupies [rsp+656, rsp+1296); [rsp, rsp+16) is a scratch slot.
main:
sub rsp, 1304
# rax = 0: fill value for rep stosq and starting offset of the copy loop
xor eax, eax
# xmm0 = 0 (16 zero bytes)
pxor xmm0, xmm0
mov ecx, 78
lea rdi, [rsp+32]
# a single 16-byte store zeroes b[0..15]
movaps XMMWORD PTR [rsp+656], xmm0
# zero a[16..639]
rep stosq
mov ecx, 78
lea rdi, [rsp+672]
# a[0..7]: 3355185 = 0x333231, i.e. bytes 0x31, 0x32, 0x33 then zeros
mov QWORD PTR [rsp+16], 3355185
# zero b[16..639]
rep stosq
# a[8..15] = 0
mov QWORD PTR [rsp+24], 0
.L16:
# 16-byte copy from a ([rsp+16+rax]) to b ([rsp+656+rax]).
# movdqa and movaps are the aligned forms; the addresses used here are
# multiples of 16 away from rsp (assumes rsp itself is 16-byte aligned
# after the sub above).
movdqa xmm1, XMMWORD PTR [rsp+16+rax]
movaps XMMWORD PTR [rsp+656+rax], xmm1
add rax, 16
# The same 16 bytes are also stored to the scratch slot at [rsp].
# Nothing in this listing reads [rsp] back.
# NOTE(review): looks like a redundant store left in by the compiler;
# its purpose is not evident from this listing.
movaps XMMWORD PTR [rsp], xmm1
# the loop copies 624 = 39*16 bytes
cmp rax, 624
jne .L16
# remaining 8 bytes: load a[624..631] here and store them to b[624..631]
# at [rsp+1280] below, for 632 bytes copied in total
mov rax, QWORD PTR [rsp+640]
# printf arguments: ecx = b[2], edx = b[1], esi = b[0], edi = format string
movsx ecx, BYTE PTR [rsp+658]
mov edi, OFFSET FLAT:.LC0
movsx edx, BYTE PTR [rsp+657]
movsx esi, BYTE PTR [rsp+656]
mov QWORD PTR [rsp+1280], rax
# eax = 0 before the variadic call (no vector registers used for arguments)
xor eax, eax
call printf
# return 0 and release the stack space
xor eax, eax
add rsp, 1304
ret
本文链接:https://cs.pynote.net/hd/asm/202302123/
-- EOF --
-- MORE --