实战x86-64汇编（Linux）

Last Updated: 2023-11-23 15:21:37 Thursday

-- TOC --

Hello World
Working with the C Library
Calling Conventions for 64-bit C Code
Command Line Arguments
calc power
Floating Point Instructions
Data Sections
Recursion

本文主体内容来自：https://cs.lmu.edu/~ray/notes/gasexamples/，略有修改，带上我自己的学习笔记，以及用Intel语法的重写。有一些x64 register方面的基础知识，参考x86-64汇编基础。

Hello World

# -------------------------------------------------------------------------
# hello.s -- prints "Hello, world" using nothing but raw Linux system calls
# (no C library). AT&T syntax, 64-bit Linux only.
#
# Build and run, either:
#
#     gcc -c hello.s && ld hello.o && ./a.out
#
# or:
#
#     gcc -nostdlib hello.s && ./a.out
# -------------------------------------------------------------------------

        .text
        .global _start

_start:
        # write(1, message, 13)
        mov     $13, %rdx               # arg 3: number of bytes to write
        mov     $message, %rsi          # arg 2: address of the bytes
        mov     $1, %rdi                # arg 1: file descriptor 1 = stdout
        mov     $1, %rax                # system call number 1 = write
        syscall

        # exit(0)
        xor     %edi, %edi              # arg 1: status 0 (a 32-bit write clears all of %rdi)
        mov     $60, %rax               # system call number 60 = exit
        syscall

message:
        .ascii  "Hello, world\n"        # 13 bytes; .ascii adds no terminating 0

#是注释。
rax存放system call number。
汇编器会根据mov后面register的位数，来确定自己的位数。（还有些x64汇编指令也具有这个特性）
syscall：system call，在32位的i386时代，用int $0x80中断进入系统调用，性能较差。不同的调用指令，system call number不一样，所使用的register也不一样！

root/arch/x86/entry/syscalls/syscall_32.tbl
root/arch/x86/entry/syscalls/syscall_64.tbl

call：library call
_start，这是进程真正的入口，用.global定义为全局符号。ld默认会搜索这个symbol作为入口，main在它里面调用。一般的C代码看不到这个入口，它在stdlib中。
message是一个local符号，它是代码的一部分，虽然只是只读数据。（代码和数据混在一起的情况，就是这样）
.ascii用来定义ASCII字符串。
gcc -nostdlib hello.s，如果不指定-nostdlib，会出现multiple definition of _start的错误，而且，此时ld会寻找main入口，显然stdlib中的_start被使用了。
If you are using a different operating system, such as OSX or Windows, the system call numbers and the registers used will likely be different. 不同的OS，system call的定义也不相同。

还可以直接使用as编译：

$ as hello.s -o hello.o
$ ld hello.o -o hello
$ ./hello
Hello, world

用Intel语法重写，有一处关键，就是在提取地址的时候，要加上offset关键词。同时，下面的代码，还增加了对len的处理，通过exit调用，返回len的值。

.intel_syntax noprefix

# Hello world through raw Linux x86-64 system calls, GAS Intel syntax.
# The process exit status is the message length (inspect it with `echo $?`).

.data
msg:
    .ascii "Hello asm\n"  # 10 chars
    len = . - msg         # byte length of msg, computed by the assembler

.global _start
.text
_start:
    # write(1, msg, len)
    mov rax, 1  # system call number: write
    mov rdi, 1  # file descriptor 1 = stdout
    # 'offset' yields the address of msg; 'offset flat: msg' is accepted too
    mov rsi, offset msg
    mov rdx, len  # assembler-computed length, so it cannot drift from the string
    syscall

    # exit(len)
    mov rax, 60   # system call number: exit
    mov rdi, len  # exit status = message length
    syscall

取msg的地址，必须要带上offset前缀！大小写都OK！
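虽然是syscall，前三个参数用的寄存器和函数调用的calling convention一样（rdi、rsi、rdx）。但两者并不完全相同：函数调用的参数顺序是dsdc89（rdi、rsi、rdx、rcx、r8、r9），而syscall的第4个参数用r10而不是rcx（rdi、rsi、rdx、r10、r8、r9），并且syscall指令本身会破坏rcx和r11。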
msg在.data段。

Working with the C Library

# -------------------------------------------------------------------------
# hola.s -- prints "Hola, mundo" by calling puts() from the C library.
# Runs on Linux or any other system that does not use underscores for
# symbols in its C library. Build and run:
#
#     gcc hola.s && ./a.out
# -------------------------------------------------------------------------

        .text
        .global main

main:                             # entered from the C library's startup code
        mov     $greeting, %rdi   # arg 1 (integer or pointer) travels in %rdi
        call    puts              # puts(greeting)
        ret                       # back to the C library's startup code

greeting:
        .asciz "Hola, mundo"      # .asciz appends the terminating 0 byte

gcc编译C代码，代码中的符号没有修饰，does not use underscores for symbols in its C library。
.asciz，自动在字符串后面加上\0，结束字符串。
ret指令从栈中弹出返回地址给rip寄存器。（返回后由caller清栈，即对rsp做加法）。

按照calling convention的规定，在call之前，rsp必须16字节对齐，但是此例没有对齐，编译后还能正常运行。用Intel语法重写，增加保持16字节对齐：

.intel_syntax noprefix

# puts() demo in GAS Intel syntax that keeps rsp 16-byte aligned at the call.

.section .rodata
msg:
    .asciz "Hello IIT"

.global main
.text
main:
    mov  rdi, offset msg  # arg 1: address of the string
    sub  rsp, 8           # pad the stack to a 16-byte boundary (push rbp would also do)
    call puts
    add  rsp, 8           # drop the padding again (or pop rbp)
    ret

由于call会自动将返回地址压栈，在每个函数入口，RSP都是未16字节对齐状态。
.section .rodata，定义只读数据区。

Calling Conventions for 64-bit C Code

x86-64 Call Conventions

打印fibonacci数的汇编代码：

# ----------------------------------------------------------------------
# fib.s -- a 64-bit Linux application that writes the first 90 Fibonacci
# numbers, one per line. It needs to be linked with a C library (printf).
#
# Assemble and Link:
#     gcc fib.s
#
# Register roles inside main:
#     rax = current number
#     rbx = next number (callee-saved, hence saved and restored here)
#     ecx = how many numbers are still to be printed
# --------------------------------------------------------------------

        .text
        .global main

main:
        push    %rbx                    # callee-saved and overwritten below

        xor     %eax, %eax              # current = 0 (32-bit write clears all of %rax)
        mov     $1, %ebx                # next = 1
        mov     $90, %ecx               # 90 numbers to go
print:
        # printf is free to overwrite rax and rcx (both caller-saved), so both
        # are parked on the stack for the duration of the call.

        push    %rax
        push    %rcx

        mov     %rax, %rsi              # arg 2: current number
        mov     $format, %rdi           # arg 1: format string
        xor     %eax, %eax              # variadic call: no vector registers in use

        # rbx, rax and rcx make three 8-byte pushes, so %rsp is aligned here
        call    printf                  # printf(format, current_number)

        pop     %rcx
        pop     %rax

        mov     %rax, %rdx              # rdx = old current
        mov     %rbx, %rax              # current = next
        add     %rdx, %rbx              # next = next + old current
        dec     %ecx                    # one fewer to print
        jnz     print                   # loop until the counter reaches zero

        pop     %rbx                    # give rbx back to the caller
        ret
format:
        .asciz  "%20ld\n"

按照ABI规则，callee使用了rbx，因此必须保存rbx原来的值，并在最后恢复。
push %rcx，这是为了让%rsp形成16bytes对齐，同时保存rcx，因为printf调用会破坏此寄存器的值。
push %rax，因为call之后的返回值会覆盖%rax的值，所以将原来的值保存在stack中，call之后再pop出来使用。（printf返回显示的字符数）
按照ABI规则，第1个参数%rdi，第2个参数%rsi......
在调用printf之前（参数少），push rcx，感觉像是个特例。从libc.a中提取printf.o，看看它的汇编，的确有一句xor ecx, ecx。

用Intel语法重写，使用不同的寄存器，通过判断CF作为结束条件，增加了按十六进制打印输出：

.intel_syntax noprefix

# Prints Fibonacci numbers in decimal and hexadecimal until the next one no
# longer fits in 64 bits, which is detected through the carry flag.
#
# Register roles (all callee-saved, so they survive the printf calls):
#   r12 = previous Fibonacci number
#   r13 = current Fibonacci number
#   r14 = 1-based row index shown in the first column

.section .rodata
fmt:
    .asciz "%4d:%24lu 0x%lX\n"

.global main
.text
main:
    push r12
    push r13
    push r14                # the third push leaves rsp 16-byte aligned for printf
    mov r14d, 1             # row index starts at 1
    mov r13d, 1             # current = 1
    xor r12d, r12d          # previous = 0

    # first row: the value 0, in both columns
    xor ecx, ecx
    xor edx, edx
    mov rsi, r14
    mov rdi, offset fmt
    xor eax, eax            # variadic call, no floating-point arguments
    call printf
print_row:                  # rows from the value 1 onwards
    inc r14
    mov rcx, r13            # same value again for the hexadecimal column
    mov rdx, r13
    mov rsi, r14
    mov rdi, offset fmt
    xor eax, eax            # variadic call, no floating-point arguments
    call printf
advance:
    mov rdx, r12            # rdx = old previous
    mov r12, r13            # previous = current
    add r13, rdx            # current += old previous; CF is set on 64-bit overflow
    jnc print_row           # keep going while the sum still fits
finish:
    pop r14
    pop r13
    pop r12
    ret

mov直接使用rbx，生成7字节机器码。如果这个改成ebx，效果一样，但只有5字节机器码。（对通用寄存器的低32位赋值时，其高32位部分直接清0，在mov或xor时常用到）
callee function在使用前必须要保存的register是：bb12215（即rbx、rbp、r12~r15），上述代码用到了r12，r13和r14。（尝试了r10和r11，这两个寄存器一样会被printf调用破坏，通过gdb跟踪代码查看寄存器可观察到）

Command Line Arguments

打印main的argv。

# -----------------------------------------------------------------------------
# A 64-bit program that displays its commandline arguments, one per line.
#
# On entry, %rdi will contain argc and %rsi will contain argv.
# Each pass prints the string %rsi currently points at, then advances %rsi by
# one pointer and decrements %rdi; the loop re-enters at the main label itself
# until %rdi reaches zero.
# -----------------------------------------------------------------------------

        .global main

        .text
main:
        push    %rdi                    # keep the remaining count: %rdi is reloaded below as puts' argument
        push    %rsi                    # keep the argv cursor across the call as well
        sub     $8, %rsp                # must align stack before call

        mov     (%rsi), %rdi            # the argument string to display
        call    puts                    # print it

        add     $8, %rsp                # restore %rsp to pre-aligned value
        pop     %rsi                    # restore the argv cursor
        pop     %rdi                    # restore the remaining count

        add     $8, %rsi                # point to next argument
        dec     %rdi                    # count down
        jnz     main                    # if not done counting keep going

        ret

这个case，通过sub和add的方式，来满足%rsp的16bytes对齐的要求。
%rsi是第2个参数，这个参数就是argv，它是char**类型，因此(%rsi)才是指向参数字符串的开始地址。
main开始就要push %rdi，因为call puts的时候，%rdi要被用来传递puts的参数，原来保存在其中的计数值会被覆盖；%rsi也同样先保存起来，调用之后再恢复。
从main开始，Calling Convention规则就用上了。
汇编代码居然可以直接在main开始的地方循环。
add $8, %rsi，这是在内存中向高地址偏移。

用Intel语法改写，功能升级：字符串转数字的x64汇编实现

calc power

# -----------------------------------------------------------------------------
# A 64-bit command line application to compute x^y.
#
# Syntax: power x y
# x and y are integers
#
# Register roles inside main:
#     r12  = argv
#     r13d = y, the exponent, counted down to zero
#     r14d = x, the base
#     eax  = running product (32-bit; it wraps silently on overflow)
# -----------------------------------------------------------------------------

        .global main

        .text
main:
        push    %r12                    # save callee-save registers
        push    %r13
        push    %r14
        # By pushing 3 registers our stack is already aligned for calls

        cmp     $3, %edi                # argc is an int: must have exactly two arguments
        jne     error1

        mov     %rsi, %r12              # argv

        mov     16(%r12), %rdi          # argv[2]
        call    atoi                    # y in eax
        cmp     $0, %eax                # disallow negative exponents
        jl      error2
        mov     %eax, %r13d             # y in r13d

        mov     8(%r12), %rdi           # argv[1]
        call    atoi                    # x in eax
        mov     %eax, %r14d             # x in r14d

        mov     $1, %eax                # start with answer = 1
check:
        test    %r13d, %r13d            # we're counting y downto 0
        jz      gotit                   # done
        imul    %r14d, %eax             # multiply in another x
        dec     %r13d
        jmp     check
gotit:                                  # print report on success
        mov     $answer, %rdi
        movslq  %eax, %rsi              # sign-extend the 32-bit product into arg 2
        xor     %rax, %rax              # variadic call: no floating-point arguments
        call    printf
        jmp     done
error1:                                 # print error message
        mov     $badArgumentCount, %rdi
        call    puts
        jmp     done
error2:                                 # print error message
        mov     $negativeExponent, %rdi
        call    puts
done:                                   # restore saved registers
        pop     %r14
        pop     %r13
        pop     %r12
        ret

answer:
        .asciz  "%d\n"
# puts appends its own newline, so the messages below carry none
badArgumentCount:
        .asciz  "Requires exactly two arguments"
negativeExponent:
        .asciz  "The exponent may not be negative"

根据ABI规则，callee使用了r12，r13，r14，因此先对这三个register做push，最后pop。
%rdi对应main的argc，%rsi对应main的argv。
call atoi，命令行传进来的都是string。
用%eax来存放连续乘法的结果，会溢出的。（如何判断溢出？）
call printf之前的xor是否可以不需要？应该不行，它用来告诉variadic printf，没有浮点数参数。（自己写测试代码，用gcc编译后检查汇编，可以看到，有几个浮点数，eax就是几）

用Intel语法重写：

.intel_syntax noprefix

# Computes base^exponent for two integer command line arguments and prints the
# 64-bit result (usage: pow base exponent).
#
# Register roles inside main (callee-saved, so they survive the calls):
#   r12 = argv
#   r13 = exponent, counted down to zero
#   r14 = base
#   rax = running product (64-bit; it wraps silently on overflow)

.section .rodata
param_err:
    .string "I needs 2 parameters."
exp_err:
    .string "exponent may not be negative."
answer:
    .string "the power is: %ld\n"


.global main
.text
main:
    push r12
    push r13
    push r14          # three pushes leave rsp 16-byte aligned for the calls below
    cmp  edi, 3       # argc must be 3: program name plus two arguments
    jne  error1
    # rsi is caller-saved and may be overwritten by atoi, so argv moves to r12
    mov  r12, rsi
    mov  rdi, [r12+16]  # argv[2]
    call atoi
    cmp  eax, 0
    jl   error2
    # atoi returns an int in eax; the upper half of rax is not defined by the
    # ABI, so only eax is read. The value is non-negative here, which makes
    # the implicit zero-extension exact.
    mov  r13d, eax    # exponent
    mov  rdi, [r12+8]   # argv[1]
    call atoi
    movsxd r14, eax   # base: sign-extend the int result to 64 bits
    # start from 1
    mov  eax, 1
check:
    test r13, r13
    jz   gotit
    imul rax, r14
    dec  r13
    jmp  check
gotit:
    mov  rdi, offset answer
    mov  rsi, rax
    xor  eax, eax     # variadic call: no floating-point arguments
    call printf
    jmp  done
error1:
    mov  rdi, offset param_err
    call puts
    jmp  done
error2:
    mov  rdi, offset exp_err
    call puts
done:
    pop  r14
    pop  r13
    pop  r12
    ret

编译和运行输出：

$ gcc pow.s -o pow
$ ./pow 2 63
the power is: -9223372036854775808

Floating Point Instructions

用汇编写个计算浮点数array的sum的接口，然后用C语言调用。现在浮点数计算，都是用SSE相关指令，使用xmm寄存器。

# -----------------------------------------------------------------------------
# A 64-bit function that returns the sum of the elements in a floating-point
# array. The function has prototype:
#
#   double sum(double array[], unsigned length)
#
# In:   %rdi = array, %esi = length. length is a 32-bit unsigned, so only %esi
#       is read; the ABI does not define the upper half of %rsi.
# Out:  %xmm0 = sum of the first length elements (0.0 when length is 0)
# -----------------------------------------------------------------------------

        .global sum
        .text
sum:
        xorpd   %xmm0, %xmm0            # initialize the sum to 0
        test    %esi, %esi              # special case for length = 0
        je      done
next:
        addsd   (%rdi), %xmm0           # add in the current array element
        add     $8, %rdi                # move to next array element
        dec     %esi                    # count down
        jnz     next                    # if not done counting, continue
done:
        ret                             # return value already in xmm0

那些非global的符号，就跟C代码中用static修饰function一样，或者就是一个可以goto的label。

Intel语法版本：

.intel_syntax noprefix

# double sum(double array[], unsigned length)
#
# In:   rdi = array, esi = length. length is a 32-bit unsigned, so only esi is
#       read; the ABI does not define the upper half of rsi.
# Out:  xmm0 = sum of the first length elements (0.0 when length is 0)

.global sum
.text
sum:
    xorpd xmm0, xmm0      # running total = 0.0
    test esi, esi         # nothing to add when length is 0
    jz done
next:
    addsd xmm0, [rdi]     # total += *array
    add rdi, 8            # step to the next double
    dec esi
    jnz next
done:
    ret                   # result is already in xmm0

C代码：

#include <stdio.h>

/* Implemented in assembly (sum.s). */
double sum(double[], unsigned);

int main() {
    double test[] = {
        40.5, 26.7, 21.9, 1.5, -40.5, -23.4
    };
    /* Prefix lengths to sum, in the order the results are printed. */
    const unsigned lengths[] = { 6, 2, 0, 3 };
    const unsigned cases = sizeof lengths / sizeof lengths[0];

    for (unsigned k = 0; k < cases; k++) {
        printf("%20.7f\n", sum(test, lengths[k]));
    }
    return 0;
}

编译运行：

$ gcc callsum.c sum.s -o cs
$ ./cs
          26.7000000
          67.2000000
           0.0000000
          89.1000000

Data Sections

计算命令行上输入数的平均数。

# -----------------------------------------------------------------------------
# 64-bit program that treats all its command line arguments as integers and
# displays their average as a floating point number. This program uses a data
# section to store intermediate results, not that it has to, but only to
# illustrate how data sections are used.
#
# On entry %edi holds argc and %rsi holds argv. %rsp is 8 bytes off a 16-byte
# boundary at that point, so every call below is preceded by an adjustment
# that brings it back onto one.
# -----------------------------------------------------------------------------

        .globl  main

        .text
main:
        dec     %edi                    # argc-1, since we don't count program name
                                        # (argc is an int; writing %edi zero-extends into %rdi)
        jz      nothingToAverage
        mov     %rdi, count             # save number of real arguments
accumulate:
        push    %rdi                    # save register across call to atoi
        push    %rsi
        sub     $8, %rsp                # two pushes leave %rsp misaligned; pad it
        mov     (%rsi,%rdi,8), %rdi     # argv[rdi]
        call    atoi                    # now eax has the int value of arg
        add     $8, %rsp                # drop the padding
        pop     %rsi                    # restore registers after atoi call
        pop     %rdi
        movslq  %eax, %rax              # atoi returns an int: sign-extend before the 64-bit add
        add     %rax, sum               # accumulate sum as we go
        dec     %rdi                    # count down
        jnz     accumulate              # more arguments?
average:
        cvtsi2sdq sum, %xmm0            # q suffix: sum and count are 64-bit (.quad)
        cvtsi2sdq count, %xmm1
        divsd   %xmm1, %xmm0            # xmm0 is sum/count
        mov     $format, %rdi           # 1st arg to printf
        mov     $1, %rax                # printf is varargs, there is 1 non-int argument

        sub     $8, %rsp                # align stack pointer
        call    printf                  # printf(format, sum/count)
        add     $8, %rsp                # restore stack pointer

        ret

nothingToAverage:
        mov     $error, %rdi
        xor     %rax, %rax              # varargs call with no non-int arguments
        sub     $8, %rsp                # align stack pointer
        call    printf
        add     $8, %rsp                # restore stack pointer
        ret

        .data
count:  .quad   0
sum:    .quad   0
format: .asciz  "%g\n"
error:  .asciz  "There are no command line arguments to average\n"

".byte"、".short"、".word"、".int"、".long"、".quad"，分别对应1,2,2,4,4,8字节。（在x86的GAS中，.long与.int相同，都是4字节；8字节用.quad）
format和error也“有幸”被定义在了.data区域。
.data区域的符号在汇编代码中被直接使用，不带任何前缀，除了字符串符号外。
mov (%rsi,%rdi,8), %rdi，%rsi是起始地址，%rdi是偏移，8表示单位长度，就像C代码中的指针+1，是移动sizeof(type)的长度。
cvtsi2sd，convert scalar int to scalar double。
call printf前后的sub和add必须要有，否则segmentation fault。

Intel语法重置版：

.intel_syntax noprefix

# Prints the average of the integer command line arguments as a floating
# point number, keeping the running total and the count in the .data section.
#
# Register roles inside main (callee-saved, so they survive the calls):
#   r12 = index of the argument being processed, counted down to zero
#   r13 = argv
#   r14 = unused; its push keeps rsp 16-byte aligned

.section .rodata
nojob_msg:
    .string "nothing needs to be done."
avg_msg:
    .string "avg: %f\n"


.data
count: .quad 0
sum:   .quad 0


.global main
.text
main:
    push r12
    push r13
    push r14
    dec edi                     # argc-1; argc is an int, writing edi zero-extends into rdi
    jz nojob

    mov QWORD PTR [count], rdi  # remember how many numbers there are
    mov r12, rdi
    mov r13, rsi
acc:
    mov rdi, [r13+r12*8]        # argv[r12]
    call atoi
    movsxd rax, eax             # atoi returns an int: sign-extend before the 64-bit add
    add QWORD PTR [sum], rax
    dec r12
    jnz acc

    cvtsi2sd xmm0, QWORD PTR [sum]
    cvtsi2sd xmm1, QWORD PTR [count]
    divsd xmm0, xmm1            # xmm0 = sum / count
    mov rdi, offset avg_msg
    mov rax, 1                  # variadic call with one floating-point argument
    call printf
    jmp done
nojob:
    mov rdi, offset nojob_msg
    call puts
done:
    pop r14
    pop r13
    pop r12
    ret

运行效果：

$ gcc avg.s -o avg
$ ./avg 1 2 3 4 5
avg: 3.000000

Recursion

汇编也可以call自己。

# ----------------------------------------------------------------------------
# A 64-bit recursive implementation of the function
#
#     uint64_t factorial(unsigned n)
#
# implemented recursively
#
# In:   %edi = n. n is a 32-bit unsigned and the ABI does not define the upper
#       half of %rdi, so it is zero-extended before any 64-bit use.
# Out:  %rax = n! (wraps modulo 2^64 for large n)
# ----------------------------------------------------------------------------

        .globl  factorial

        .text
factorial:
        mov     %edi, %edi              # zero-extend n into all of %rdi
        cmp     $1, %rdi                # n <= 1?
        jnbe    L1                      # if not, go do a recursive call
        mov     $1, %rax                # otherwise return 1
        ret
L1:
        push    %rdi                    # save n on stack (also aligns %rsp!)
        dec     %rdi                    # n-1
        call    factorial               # factorial(n-1), result goes in %rax
        pop     %rdi                    # restore n
        imul    %rdi, %rax              # n * factorial(n-1), stored in %rax
        ret

Intel syntax version:

.intel_syntax noprefix

# uint64_t factorial(unsigned n), recursive.
#
# In:   edi = n. n is a 32-bit unsigned and the ABI does not define the upper
#       half of rdi, so it is zero-extended before any 64-bit use.
# Out:  rax = n! (wraps modulo 2^64 for large n)

.global factorial
.text
factorial:
    mov edi, edi      # zero-extend n into all of rdi
    cmp rdi, 1
    ja recur          # n > 1: recurse
    mov rax, 1        # 0! = 1! = 1
    ret
recur:
    push rdi          # keep n across the call; the push also realigns rsp
    dec rdi
    call factorial    # rax = factorial(n-1)
    pop rdi
    imul rax, rdi     # rax = n * factorial(n-1)
    ret

caller in C:

#include <stdio.h>
#include <inttypes.h>

/* Implemented in assembly (recur.s). */
uint64_t factorial(unsigned n);

int main() {
    for (unsigned i = 0; i < 10; i++) {
        /* PRIu64 is the portable conversion for uint64_t; "%lu" is only
           correct where uint64_t happens to be unsigned long. */
        printf("factorial(%2u) = %" PRIu64 "\n", i, factorial(i));
    }
    return 0;
}

运行效果：

$ gcc call_recur.c recur.s -o recur
$ ./recur
factorial( 0) = 1
factorial( 1) = 1
factorial( 2) = 2
factorial( 3) = 6
factorial( 4) = 24
factorial( 5) = 120
factorial( 6) = 720
factorial( 7) = 5040
factorial( 8) = 40320
factorial( 9) = 362880

本文链接：https://cs.pynote.net/hd/asm/202212093/

-- EOF --

-- MORE --