x64汇编学习(9)-- array

-- TOC --

测试用C代码如下:

#include <stdio.h>

// Copy bytes from `src` to `dest`, 8 bytes (one long long) per iteration.
// The pointers are held as unsigned 64-bit integers, so `+= 8` advances
// them by 8 bytes.
//
// NOTE: the loop condition is `left > 8`, not `left >= 8`. For any len > 0
// the loop therefore exits while 1..8 bytes are still pending, and that
// tail is never copied. With len == 640 only 632 bytes (79 iterations) are
// copied. The assembly listings in this article were generated from exactly
// this source, so the code is intentionally left unchanged.
void simple_memcpy_test(void *dest, const void *src, size_t len){
    unsigned long long d = (unsigned long long)dest;
    unsigned long long s = (unsigned long long)src;
    size_t left = len;

    // `left` is unsigned, so this is an unsigned comparison.
    while(left > 8){
        *(long long*)d = *(long long*)s;
        d += 8;
        s += 8;
        left -= 8;
    }
}

// Test driver: copy a[] into b[] and print the first three bytes of b.
int main(){
    // a[0..2] = '1','2','3'; the remaining elements are zero-initialized.
    char a[640] = {'1','2','3'};
    // b is entirely zero-initialized (empty initializer list).
    char b[640] = {};
    simple_memcpy_test(b, a, 640);
    // The chars are promoted to int when passed to the variadic printf.
    printf("%c %c %c\n", b[0], b[1], b[2]);
    return 0;
}

不开优化,汇编如下:

# void simple_memcpy_test(void *dest, const void *src, size_t len)
# Unoptimized build. In: rdi = dest, rsi = src, rdx = len.
# Stack slots (relative to rbp):
#   [rbp-40] = dest   [rbp-48] = src   [rbp-56] = len   (spilled arguments)
#   [rbp-8]  = d      [rbp-16] = s     [rbp-24] = left  (locals)
# rsp is never lowered: this is a leaf function, so the locals simply live
# below rsp (the System V red zone).
simple_memcpy_test:
        push    rbp
        mov     rbp, rsp
        # Spill the three register arguments to the stack.
        mov     QWORD PTR [rbp-40], rdi
        mov     QWORD PTR [rbp-48], rsi
        mov     QWORD PTR [rbp-56], rdx
        # d = dest
        mov     rax, QWORD PTR [rbp-40]
        mov     QWORD PTR [rbp-8], rax
        # s = src
        mov     rax, QWORD PTR [rbp-48]
        mov     QWORD PTR [rbp-16], rax
        # left = len
        mov     rax, QWORD PTR [rbp-56]
        mov     QWORD PTR [rbp-24], rax
        # Jump to the loop condition first (while-loop shape).
        jmp     .L2
.L3:
        # [rbp-16] holds the pointer value s, so it is loaded into rdx
        # first and then dereferenced with [rdx] to fetch the pointed-to
        # 8 bytes.
        mov     rdx, QWORD PTR [rbp-16]
        mov     rax, QWORD PTR [rbp-8]
        mov     rdx, QWORD PTR [rdx]
        # *(long long*)d = *(long long*)s
        mov     QWORD PTR [rax], rdx
        # d += 8; s += 8; left -= 8 -- each variable is updated directly
        # in its stack slot (d and s are pointers, left is a byte counter).
        add     QWORD PTR [rbp-8], 8
        add     QWORD PTR [rbp-16], 8
        sub     QWORD PTR [rbp-24], 8
.L2:
        # Compare left with 8 (computes left - 8 and sets flags only).
        cmp     QWORD PTR [rbp-24], 8
        # if left > 8 (unsigned "above"), goto .L3
        ja      .L3
        # Loop finished: fall through to the return sequence.
        nop
        nop
        pop     rbp
        ret
# printf format string.
.LC0:
        .string "%c %c %c\n"
# int main(void) -- unoptimized build.
# Stack layout: a[640] at [rbp-640 .. rbp-1], b[640] at [rbp-1280 .. rbp-641].
main:
        push    rbp
        mov     rbp, rsp
        # 640*2 = 1280 bytes for the two arrays.
        # Unlike the leaf function, main makes calls, and each call pushes
        # a return address at rsp, so rsp really has to be moved below the
        # locals.
        sub     rsp, 1280
        # Initialize the first 8 bytes of a with a single store:
        # 3355185 = 0x333231, i.e. the bytes '1','2','3' (little-endian)
        # followed by five zero bytes, which sets a[0..2].
        mov     QWORD PTR [rbp-640], 3355185
        # a[8..15] = 0
        mov     QWORD PTR [rbp-632], 0
        # rdx = &a[16]
        lea     rdx, [rbp-624]
        # Writing eax zero-extends, so rax = 0.
        mov     eax, 0
        # Repeat count is 78: 78*8 = 624 = 640 - 16 remaining bytes.
        mov     ecx, 78
        mov     rdi, rdx
        # rep stosq: store rax at the address in rdi, then advance rdi by
        # 8 (DF=0) and decrement the count register; repeat until the
        # count reaches 0. This zeroes a[16..639].
        rep stosq
        # Same sequence for b: all 640 bytes are zeroed.
        mov     QWORD PTR [rbp-1280], 0
        mov     QWORD PTR [rbp-1272], 0
        lea     rdx, [rbp-1264]
        mov     eax, 0
        mov     ecx, 78
        mov     rdi, rdx
        rep stosq
        # Take the array addresses and set up the arguments:
        # rdi = b (dest), rsi = a (src), edx = 640 (len); then call.
        lea     rcx, [rbp-640]
        lea     rax, [rbp-1280]
        mov     edx, 640
        mov     rsi, rcx
        mov     rdi, rax
        call    simple_memcpy_test
        # Load b[2], b[1], b[0]; each byte is sign-extended to 32 bits
        # (char -> int) and placed in ecx, edx, esi respectively.
        movzx   eax, BYTE PTR [rbp-1278]
        movsx   ecx, al
        movzx   eax, BYTE PTR [rbp-1279]
        movsx   edx, al
        movzx   eax, BYTE PTR [rbp-1280]
        movsx   eax, al
        mov     esi, eax
        # edi = address of the format string.
        mov     edi, OFFSET FLAT:.LC0
        # Variadic call: al = number of vector registers used (none).
        mov     eax, 0
        call    printf
        # return 0
        mov     eax, 0
        leave
        ret

-O1

# void simple_memcpy_test(void *dest, const void *src, size_t len) -- -O1.
# In: rdi = dest, rsi = src, rdx = len. Everything stays in registers.
simple_memcpy_test:
        # if len <= 8 (unsigned), nothing is copied.
        cmp     rdx, 8
        jbe     .L1
        # rdx = (len - 9) >> 3; for len = 640: (640-9)>>3 = 78
        sub     rdx, 9
        shr     rdx, 3
        # rcx = 8 + rdx*8 = end offset in bytes; for len = 640 this is
        # 79*8 = 632.
        # This exposes the bug in the C source: the condition is
        # left > 8, so the loop already stops when left == 8.
        # Hence 79 chunks of 8 bytes are copied, not 80.
        lea     rcx, [8+rdx*8]
        # rax = 0 (running byte offset)
        mov     eax, 0
.L3:
        # rax is used directly as the offset from both rsi and rdi.
        mov     rdx, QWORD PTR [rsi+rax]
        mov     QWORD PTR [rdi+rax], rdx
        add     rax, 8
        # Compare the running offset with the end offset; continue while
        # they differ.
        cmp     rax, rcx
        jne     .L3
.L1:
        ret
# printf format string.
.LC0:
        .string "%c %c %c\n"
# int main(void) -- -O1. No frame pointer; locals are addressed from rsp.
# Stack layout: b[640] at [rsp .. rsp+639], a[640] at [rsp+640 .. rsp+1279].
main:
        # 1280 + 8: two arrays plus 8 bytes of padding, which leaves rsp
        # 16-byte aligned at the call sites (rsp is 8 mod 16 on entry).
        sub     rsp, 1288
        # a[0..7]: 3355185 = 0x333231 = '1','2','3' then zeros.
        mov     QWORD PTR [rsp+640], 3355185
        # a[8..15] = 0
        mov     QWORD PTR [rsp+648], 0
        # Zero a[16..639]: 78 qwords starting at &a[16].
        lea     rdi, [rsp+656]
        mov     eax, 0
        mov     ecx, 78
        rep stosq
        # Zero b the same way; rax is still 0 from above.
        mov     QWORD PTR [rsp], 0
        mov     QWORD PTR [rsp+8], 0
        lea     rdi, [rsp+16]
        mov     ecx, 78
        rep stosq
        # simple_memcpy_test(b, a, 640): rdi = b, rsi = a, edx = 640.
        mov     edx, 640
        lea     rsi, [rsp+640]
        mov     rdi, rsp
        call    simple_memcpy_test
        # Sign-extending loads straight from memory:
        # ecx = b[2], edx = b[1], esi = b[0].
        movsx   ecx, BYTE PTR [rsp+2]
        movsx   edx, BYTE PTR [rsp+1]
        movsx   esi, BYTE PTR [rsp]
        mov     edi, OFFSET FLAT:.LC0
        # Variadic call: al = number of vector registers used (none).
        mov     eax, 0
        call    printf
        # return 0
        mov     eax, 0
        add     rsp, 1288
        ret

-O2

simple_memcpy_test被inline进了main(独立的函数体仍然保留),但copy的时候,每次还是8bytes。

# void simple_memcpy_test(void *dest, const void *src, size_t len) -- -O2.
# In: rdi = dest, rsi = src, rdx = len.
# Standalone copy of the function; main below contains its own inlined
# version of the loop and does not call this one.
simple_memcpy_test:
        # if len <= 8 (unsigned), nothing is copied.
        cmp     rdx, 8
        jbe     .L1
        # rdx = (len - 9) >> 3, rax = 0 (byte offset)
        sub     rdx, 9
        xor     eax, eax
        shr     rdx, 3
        # rcx = 8 + rdx*8 = end offset in bytes (632 for len = 640)
        lea     rcx, [8+rdx*8]
.L3:
        # Copy 8 bytes at offset rax, then advance the offset.
        mov     rdx, QWORD PTR [rsi+rax]
        mov     QWORD PTR [rdi+rax], rdx
        add     rax, 8
        # Loop until the offset reaches the end offset.
        cmp     rax, rcx
        jne     .L3
.L1:
        ret
# printf format string.
.LC0:
        .string "%c %c %c\n"
# int main(void) -- -O2. The copy loop is inlined; there is no call to
# simple_memcpy_test.
# Stack layout: a[640] at [rsp .. rsp+639], b[640] at [rsp+640 .. rsp+1279].
main:
        sub     rsp, 1288
        # rax = 0: used as the stosq fill value and later as the loop offset.
        xor     eax, eax
        # xmm0 = 0
        pxor    xmm0, xmm0
        # Zero a[16..639] (78 qwords); a[0..7] gets '1','2','3' + zeros.
        mov     ecx, 78
        lea     rdi, [rsp+16]
        mov     QWORD PTR [rsp], 3355185
        rep stosq
        # Zero b[16..639] (78 qwords); a[8..15] = 0.
        lea     rdi, [rsp+656]
        mov     ecx, 78
        mov     QWORD PTR [rsp+8], 0
        rep stosq
        # 16-byte store: zero b[0..15] in one instruction.
        movaps  XMMWORD PTR [rsp+640], xmm0
.L7:
        # Inlined copy loop: b[rax..rax+7] = a[rax..rax+7], still 8 bytes
        # per iteration; rax starts at 0.
        mov     rdx, QWORD PTR [rsp+rax]
        mov     QWORD PTR [rsp+640+rax], rdx
        add     rax, 8
        # Stop at offset 632 (79 iterations); the last 8 bytes are not copied.
        cmp     rax, 632
        jne     .L7
        # printf arguments: ecx = b[2], edx = b[1], esi = b[0],
        # edi = format string, eax = 0 (no vector register arguments).
        movsx   ecx, BYTE PTR [rsp+642]
        movsx   edx, BYTE PTR [rsp+641]
        mov     edi, OFFSET FLAT:.LC0
        xor     eax, eax
        movsx   esi, BYTE PTR [rsp+640]
        call    printf
        # return 0
        xor     eax, eax
        add     rsp, 1288
        ret

-O3

# void simple_memcpy_test(void *dest, const void *src, size_t len) -- -O3.
# In: rdi = dest, rsi = src, rdx = len.
# Two paths: an 8-byte scalar loop (.L3/.L6) and a 16-byte vector loop
# (.L14/.L4) followed by an optional single 8-byte tail copy.
simple_memcpy_test:
        # rcx = dest (rdi is reused as a scratch register below)
        mov     rcx, rdi
        # if len <= 8 (unsigned), nothing is copied.
        cmp     rdx, 8
        jbe     .L1
        sub     rdx, 9
        cmp     rdx, 23
        # Short length (len - 9 <= 23, i.e. at most 3 qwords): goto .L3
        # and use the scalar loop.
        jbe     .L3
        # Take the vector path only if dest != src + 8. When dest == src + 8
        # every 8-byte store feeds the next 8-byte load, which a 16-byte
        # load/store pair would not reproduce, so that case falls through
        # to the scalar loop.
        lea     rax, [rsi+8]
        cmp     rdi, rax
        jne     .L14
.L3:
        # Scalar path: rdi = 8 + ((len - 9) >> 3) * 8 = end offset, rax = 0.
        shr     rdx, 3
        xor     eax, eax
        lea     rdi, [8+rdx*8]
.L6:
        # Copy 8 bytes at offset rax until the end offset is reached.
        mov     rdx, QWORD PTR [rsi+rax]
        mov     QWORD PTR [rcx+rax], rdx
        add     rax, 8
        cmp     rax, rdi
        jne     .L6
.L1:
        ret
.L14:
        # Vector path: rdx = ((len - 9) >> 3) + 1 = number of qwords to copy.
        shr     rdx, 3
        xor     eax, eax
        add     rdx, 1
        # rdi = (rdx >> 1) << 4 = bytes covered by whole 16-byte pairs.
        mov     rdi, rdx
        shr     rdi
        sal     rdi, 4
.L4:
        # movdqu and movups work on addresses that are not 16-byte aligned.
        movdqu  xmm0, XMMWORD PTR [rsi+rax]
        movups  XMMWORD PTR [rcx+rax], xmm0
        add     rax, 16
        cmp     rax, rdi
        jne     .L4
        # rax = (qword count rounded down to even) * 8 = bytes copied so far;
        # advance both pointers past the vector-copied part.
        mov     rax, rdx
        and     rax, -2
        sal     rax, 3
        add     rcx, rax
        add     rax, rsi
        # Even qword count: done. Odd: copy the one remaining qword.
        and     edx, 1
        je      .L1
        mov     rax, QWORD PTR [rax]
        mov     QWORD PTR [rcx], rax
        ret
# printf format string.
.LC0:
        .string "%c %c %c\n"
# int main(void) -- -O3. The copy is inlined and vectorized; there is no
# call to simple_memcpy_test.
# Stack layout: a[640] at [rsp+16 .. rsp+655], b[640] at [rsp+656 .. rsp+1295].
main:
        sub     rsp, 1304
        # rax = 0: stosq fill value and, later, the loop offset.
        xor     eax, eax
        # xmm0 = 0
        pxor    xmm0, xmm0
        # Zero a[16..639] (78 qwords) and, with one 16-byte store, b[0..15].
        mov     ecx, 78
        lea     rdi, [rsp+32]
        movaps  XMMWORD PTR [rsp+656], xmm0
        rep stosq
        # Zero b[16..639] (78 qwords); a[0..7] = '1','2','3' + zeros.
        mov     ecx, 78
        lea     rdi, [rsp+672]
        mov     QWORD PTR [rsp+16], 3355185
        rep stosq
        # a[8..15] = 0
        mov     QWORD PTR [rsp+24], 0
.L16:
        # 16-byte copy per iteration: b[rax..rax+15] = a[rax..rax+15].
        # The addresses stay 16-byte aligned (rsp is 16-byte aligned after
        # the sub above, and 16 and 656 are multiples of 16), so the
        # aligned forms movdqa/movaps can be used.
        movdqa  xmm1, XMMWORD PTR [rsp+16+rax]
        movaps  XMMWORD PTR [rsp+656+rax], xmm1
        add     rax, 16
        # NOTE(review): this stores the vector just copied into the scratch
        # slot at [rsp]. Nothing in this listing reads [rsp] back, so it
        # looks like a leftover dead store from the compiler -- confirm.
        movaps  XMMWORD PTR [rsp], xmm1
        # 39 iterations * 16 = 624 bytes (78 qwords).
        cmp     rax, 624
        jne     .L16
        # Tail: the 79th qword, a[624..631] -> b[624..631]; its load and
        # store (below) are interleaved with the printf argument setup.
        # 632 bytes are copied in total, same as the scalar versions.
        mov     rax, QWORD PTR [rsp+640]
        # printf arguments: ecx = b[2], edx = b[1], esi = b[0],
        # edi = format string.
        movsx   ecx, BYTE PTR [rsp+658]
        mov     edi, OFFSET FLAT:.LC0
        movsx   edx, BYTE PTR [rsp+657]
        movsx   esi, BYTE PTR [rsp+656]
        mov     QWORD PTR [rsp+1280], rax
        # Variadic call: al = number of vector registers used (none).
        xor     eax, eax
        call    printf
        # return 0
        xor     eax, eax
        add     rsp, 1304
        ret

本文链接:https://cs.pynote.net/hd/asm/202302123/

-- EOF --

-- MORE --