; ---------------------------------------------------------------------------------------- ; Writes "Hello, World" to the console using only system calls. Runs on 64-bit Linux only. ; To assemble and run: ; ; nasm -felf64 hello.asm && ld hello.o && ./a.out ; ----------------------------------------------------------------------------------------
; Linux x86-64 "Hello, World" using raw system calls (NASM syntax).
; Entry point is _start, so no C runtime is involved.

        global  _start

SYS_WRITE       equ     1                       ; syscall number placed in rax for write
SYS_EXIT        equ     60                      ; syscall number placed in rax for exit
STDOUT          equ     1                       ; file handle 1 is stdout

        section .text
_start:
        ; write(STDOUT, message, message_len)
        mov     eax, SYS_WRITE                  ; 32-bit write zero-extends into rax
        mov     edi, STDOUT
        mov     rsi, message                    ; address of string to output
        mov     edx, message_len                ; number of bytes (13)
        syscall                                 ; invoke operating system to do the write

        ; exit(0)
        mov     eax, SYS_EXIT
        xor     edi, edi                        ; exit code 0
        syscall                                 ; invoke operating system to exit

        section .data
message:
        db      "Hello, World", 10              ; note the newline at the end
message_len     equ     $ - message             ; 12 characters + newline = 13
; ---------------------------------------------------------------------------------------- ; Writes "Hello, World" to the console using only system calls. Runs on 64-bit macOS only. ; To assemble and run: ; ; nasm -fmacho64 hello.asm && ld hello.o && ./a.out ; ----------------------------------------------------------------------------------------
; macOS x86-64 "Hello, World" using raw system calls (NASM syntax).
; Addresses are formed RIP-relative with lea, because absolute addressing of
; labels is not reliably accepted when linking on macOS.

        global  start
        default rel

SYS_WRITE       equ     0x02000004              ; value placed in rax for write
SYS_EXIT        equ     0x02000001              ; value placed in rax for exit
STDOUT          equ     1                       ; file handle 1 is stdout

        section .text
start:
        ; write(STDOUT, message, message_len)
        mov     eax, SYS_WRITE                  ; 32-bit write zero-extends into rax
        mov     edi, STDOUT
        lea     rsi, [message]                  ; RIP-relative address of the string
        mov     edx, message_len                ; number of bytes (13)
        syscall                                 ; invoke operating system to do the write

        ; exit(0)
        mov     eax, SYS_EXIT
        xor     edi, edi                        ; exit code 0
        syscall                                 ; invoke operating system to exit

        section .data
message:
        db      "Hello, World", 10              ; note the newline at the end
message_len     equ     $ - message             ; 12 characters + newline = 13
mov x, y        ; x ← y
and x, y        ; x ← x and y
or x, y         ; x ← x or y
xor x, y        ; x ← x xor y
add x, y        ; x ← x + y
sub x, y        ; x ← x – y
inc x           ; x ← x + 1
dec x           ; x ← x – 1
syscall         ; Invoke an operating system routine
db              ; A pseudo-instruction that declares bytes that will be in memory when the program runs
[750]                   ; displacement only
[rbp]                   ; base register only
[rcx + rsi*4]           ; base + index * scale
[rbp + rdx]             ; scale is 1
[rbx - 8]               ; displacement is -8
[rax + rdi*8 + 500]     ; all four components
[rbx + counter]         ; uses the address of the variable 'counter' as the displacement
立即数(Immediate Operands)
这些可以用多种方式编写。以下是官方文档中的一些示例。
1 2 3 4 5 6 7 8 9 10 11
200             ; decimal
0200            ; still decimal - the leading 0 does not make it octal
0200d           ; explicitly decimal - d suffix
0d200           ; also decimal - 0d prefix
0c8h            ; hex - h suffix, but leading 0 is required because c8h looks like a var
0xc8            ; hex - the classic 0x prefix
0hc8            ; hex - for some reason NASM likes 0h
310q            ; octal - q suffix
0q310           ; octal - 0q prefix
11001000b       ; binary - b suffix
0b1100_1000     ; binary - 0b prefix, and by the way, underscores are allowed
; ----------------------------------------------------------------------------------------
; This is a macOS console program that writes a little triangle of asterisks to standard
; output. Runs on macOS only.
;
;     nasm -fmacho64 triangle.asm && ld triangle.o && ./a.out
;
; The whole triangle is first built in the buffer 'output' and then written with a
; single write system call.
;
; Register roles:
;     rdx = address of the next byte to write in 'output'
;     r8  = length of the line currently being written (1 .. maxlines)
;     r9  = number of stars written on the current line so far
; ----------------------------------------------------------------------------------------

        global  start
        default rel

SYS_WRITE       equ     0x02000004              ; value placed in rax for write
SYS_EXIT        equ     0x02000001              ; value placed in rax for exit

; The code below refers to maxlines, dataSize and output, so they must be defined in
; this file. The triangle height chosen here is 8 lines; dataSize is derived from it:
; 1 + 2 + ... + maxlines stars plus one newline per line (44 bytes for 8 lines).
maxlines        equ     8
dataSize        equ     maxlines * (maxlines + 1) / 2 + maxlines

        section .text
start:
        lea     rdx, [output]                   ; rdx holds address of next byte to write
        mov     r8, 1                           ; initial line length
        xor     r9d, r9d                        ; number of stars written on line so far
line:
        mov     byte [rdx], '*'                 ; write single star
        inc     rdx                             ; advance pointer to next cell to write
        inc     r9                              ; count stars so far on this line
        cmp     r9, r8                          ; did we reach the number of stars for this line?
        jne     line                            ; not yet, keep writing on this line
lineDone:
        mov     byte [rdx], 10                  ; write a newline char
        inc     rdx                             ; and move pointer to where next char goes
        inc     r8                              ; next line will be one char longer
        xor     r9d, r9d                        ; reset count of stars written on this line
        cmp     r8, maxlines                    ; did we already finish the last line?
        jng     line                            ; if not, begin writing the next line
done:
        mov     eax, SYS_WRITE                  ; system call for write
        mov     edi, 1                          ; file handle 1 is stdout
        lea     rsi, [output]                   ; address of string to output
        mov     edx, dataSize                   ; number of bytes
        syscall                                 ; invoke operating system to do the write

        mov     eax, SYS_EXIT                   ; system call for exit
        xor     edi, edi                        ; exit code 0
        syscall                                 ; invoke operating system to exit

        section .bss
output:
        resb    dataSize                        ; writable buffer that receives the triangle
je jumps to a label if the previous comparison was equal. We also have jne (jump if not equal), jl (jump if less), jnl (jump if not less), jg (jump if greater), jng (jump if not greater), jle (jump if less or equal), jnle (jump if not less or equal), jge (jump if greater or equal), jnge (jump if not greater or equal), and many more.
equ is actually not a real instruction. It simply defines an abbreviation for the assembler itself to use. (This is a profound idea.)
The .bss section is for writable data.
使用 C 库
仅使用系统调用编写独立程序很酷,但很少见。我们想使用 C 库中的好东西。
还记得 C 语言中的执行是如何从 main 函数“开始”的吗?那是因为 C 库内部实际上有 _start 标签! _start 处的代码进行一些初始化,然后调用 main,然后进行一些清理,然后发出系统调用以退出。所以你只需要实现 main。我们可以用汇编语言做到这一点!
如果你有 Linux,试试这个:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
; ---------------------------------------------------------------------------------------- ; Writes "Hola, mundo" to the console using a C library. Runs on Linux. ; ; nasm -felf64 hola.asm && gcc hola.o && ./a.out ; ----------------------------------------------------------------------------------------
; int main(void) -- called by the C library startup code (System V AMD64 ABI).
; Prints "Hola, mundo" with puts. The value puts leaves in eax becomes main's
; return value, exactly as in the original listing.

        global  main
        extern  puts

        section .text
main:
        sub     rsp, 8                          ; rsp is 8 mod 16 on entry; realign to 16 for the call
        lea     rdi, [rel message]              ; first integer (or pointer) argument in rdi
        call    puts                            ; puts(message)
        add     rsp, 8                          ; undo the alignment adjustment
        ret                                     ; return from main back into C library wrapper

message:
        db      "Hola, mundo", 0                ; note strings must be terminated with 0 in C
; ---------------------------------------------------------------------------------------- ; This is an macOS console program that writes "Hola, mundo" on one line and then exits. ; It uses puts from the C library. To assemble and run: ; ; nasm -fmacho64 hola.asm && gcc hola.o && ./a.out ; ----------------------------------------------------------------------------------------
; int main(void) for macOS (symbols carry a leading underscore).
; Prints "Hola, mundo" through the C library's puts.

        global  _main
        extern  _puts

        section .text
_main:
        sub     rsp, 8                          ; call stack must be 16-byte aligned at the call
        lea     rdi, [rel message]              ; first argument is address of message
        call    _puts                           ; puts(message)
        add     rsp, 8                          ; fix up stack before returning
        ret

        section .data
message:
        db      "Hola, mundo", 0                ; C strings need a zero byte at the end
How did we know the argument to puts was supposed to go in RDI? Answer: there are a number of conventions that are followed regarding calls.
When writing code for 64-bit Linux that integrates with a C library, you must follow the calling conventions explained in the AMD64 ABI Reference. You can also get this information from Wikipedia. The most important points are:
From left to right, pass as many parameters as will fit in registers. The order in which registers are allocated is:
For integers and pointers, rdi, rsi, rdx, rcx, r8, r9.
Additional parameters are pushed on the stack, right to left, and are to be removed by the caller after the call.
After the parameters are pushed, the call instruction is made, so when the called function gets control, the return address is at [rsp], the first memory parameter is at [rsp+8], etc.
The stack pointer rsp must be aligned to a 16-byte boundary before making a call. Fine, but the process of making a call pushes the return address (8 bytes) on the stack, so when a function gets control, rsp is not aligned. You have to make that extra space yourself, by pushing something or subtracting 8 from rsp.
The only registers that the called function is required to preserve (the callee-save registers) are: rbp, rbx, r12, r13, r14, r15. All others are free to be changed by the called function.
The callee is also supposed to save the control bits of the MXCSR and the x87 control word, but x87 instructions are rare in 64-bit code so you probably don’t have to worry about this.
Integers are returned in rax or rdx:rax, and floating point values are returned in xmm0 or xmm1:xmm0.
; ----------------------------------------------------------------------------- ; A 64-bit Linux application that writes the first 90 Fibonacci numbers. To ; assemble and run: ; ; nasm -felf64 fib.asm && gcc fib.o && ./a.out ; -----------------------------------------------------------------------------
; int main(void) -- prints the first 90 Fibonacci numbers, one per line, with printf.
; ABI: System V AMD64.
;
; Register roles:
;     ecx = remaining count (90 down to 0)
;     rax = current Fibonacci number
;     rbx = next Fibonacci number (callee-saved, so it survives printf)

        global  main
        extern  printf

        section .text
main:
        push    rbx                             ; we have to save this since we use it

        mov     ecx, 90                         ; ecx will count down to 0
        xor     eax, eax                        ; rax will hold the current number
        mov     ebx, 1                          ; rbx will hold the next number, originally 1
print:
        ; We need to call printf, but we are using rax, rbx, and rcx. printf
        ; may destroy rax and rcx, so we save these before the call and
        ; restore them afterwards. These two pushes are what the pops below
        ; undo; without them the pops would unbalance the stack.
        push    rax                             ; caller-save register
        push    rcx                             ; caller-save register

        lea     rdi, [rel format]               ; set 1st parameter (format)
        mov     rsi, rax                        ; set 2nd parameter (current_number)
        xor     eax, eax                        ; because printf is varargs (no vector args)

        ; Stack is already aligned because we pushed three 8-byte registers
        call    printf                          ; printf(format, current_number)

        pop     rcx                             ; restore caller-save register
        pop     rax                             ; restore caller-save register

        mov     rdx, rax                        ; save the current number
        mov     rax, rbx                        ; next number is now current
        add     rbx, rdx                        ; get the new next number
        dec     ecx                             ; count down
        jnz     print                           ; if not done counting, do some more

        pop     rbx                             ; restore rbx before returning
        ret

format:
        db      "%20ld", 10, 0
push x Decrement rsp by the size of the operand, then store x in [rsp]
pop x Move [rsp] into x, then increment rsp by the size of the operand
jnz label If the processor’s Z (zero) flag is not set (that is, the last result was nonzero), jump to the given label
call label Push the address of the next instruction, then jump to the label
ret Pop into the instruction pointer
混合 C 和汇编语言
这个程序只是一个简单的函数,它接受三个整数参数并返回最大值。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
; ----------------------------------------------------------------------------- ; A 64-bit function that returns the maximum value of its three 64-bit integer ; arguments. The function has signature: ; ; int64_t maxofthree(int64_t x, int64_t y, int64_t z) ; ; Note that the parameters have already been passed in rdi, rsi, and rdx. We ; just have to return the value in rax. ; -----------------------------------------------------------------------------
; int64_t maxofthree(int64_t x, int64_t y, int64_t z)
; In:  rdi = x, rsi = y, rdx = z
; Out: rax = the largest of the three (signed comparison)

        global  maxofthree

        section .text
maxofthree:
        mov     rax, rdi                        ; candidate = x
        cmp     rsi, rax                        ; is y greater than the candidate?
        cmovg   rax, rsi                        ; if so, candidate = y
        cmp     rdx, rax                        ; is z greater than max(x, y)?
        cmovg   rax, rdx                        ; if so, candidate = z
        ret                                     ; the max is in rax
这是一个调用汇编语言函数的 C 程序。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/* * A small program that illustrates how to call the maxofthree function we wrote in * assembly language. */
After an arithmetic or logic instruction, or the compare instruction, cmp, the processor sets or clears bits in its rflags. The most interesting flags are:
s (sign)
z (zero)
c (carry)
o (overflow)
So after doing, say, an addition instruction, we can perform a jump, move, or set, based on the new flag settings. For example:
jz label Jump to label if the result of the operation was zero
cmovno x, y x ← y if the last operation did not overflow
setc x x ← 1 if the last operation had a carry, but x ← 0 otherwise (x must be a byte-size register or memory location)
The conditional instructions have three base forms: j for conditional jump, cmov for conditional move, and set for conditional set. The suffix of the instruction has one of the 30 forms: s ns z nz c nc o no p np pe po e ne l nl le nle g ng ge nge a na ae nae b nb be nbe.
; ----------------------------------------------------------------------------- ; A 64-bit program that displays its command line arguments, one per line. ; ; On entry, rdi will contain argc and rsi will contain argv. ; -----------------------------------------------------------------------------
; int main(int argc, char **argv) -- prints each command line argument on its own line.
; On entry rdi = argc and rsi = argv. Each pass prints the string rsi points at,
; then advances rsi by one pointer and decrements the count in rdi.

        global  main
        extern  puts

        section .text
main:
.print_arg:
        push    rsi                             ; puts may clobber rsi and rdi, so keep copies
        push    rdi
        sub     rsp, 8                          ; must align stack before call

        mov     rdi, [rsi]                      ; the argument string to display
        call    puts                            ; print it

        add     rsp, 8                          ; restore rsp to pre-aligned value
        pop     rdi                             ; restore the saved count and pointer
        pop     rsi

        add     rsi, 8                          ; point to next argument
        dec     rdi                             ; count down
        jnz     .print_arg                      ; if not done counting keep going

        ret
1 2 3 4 5 6
$ nasm -felf64 echo.asm && gcc echo.o && ./a.out dog 22 -zzz "hi there" ./a.out dog 22 -zzz hi there
一个更长的例子
请注意,就 C 库而言,命令行参数始终是字符串。如果你想将它们视为整数,请调用 atoi。这是一个计算 x^y(x 的 y 次方)的简洁程序。
; ----------------------------------------------------------------------------- ; A 64-bit command line application to compute x^y. ; ; Syntax: power x y ; x and y are (32-bit) integers ; -----------------------------------------------------------------------------
; int main(int argc, char **argv) -- computes x^y for 32-bit integers x = argv[1]
; and y = argv[2], printing the result with printf. ABI: System V AMD64.
;
; Register roles (all callee-saved, so they survive the calls to atoi):
;     r12  = argv
;     r13d = y, counted down to zero
;     r14d = x
;     eax  = running product (only live after the last atoi call)

        global  main
        extern  printf
        extern  puts
        extern  atoi

        section .text
main:
        push    r12                             ; save callee-save registers
        push    r13
        push    r14
        ; By pushing 3 registers our stack is already aligned for calls

        cmp     edi, 3                          ; argc is a 32-bit int: must have exactly two arguments
        jne     error1

        mov     r12, rsi                        ; argv

        mov     rdi, [r12 + 16]                 ; argv[2]
        call    atoi                            ; y in eax
        test    eax, eax                        ; disallow negative exponents
        js      error2
        mov     r13d, eax                       ; y in r13d

        mov     rdi, [r12 + 8]                  ; argv[1]
        call    atoi                            ; x in eax
        mov     r14d, eax                       ; x in r14d

        mov     eax, 1                          ; start with answer = 1
check:
        test    r13d, r13d                      ; we're counting y down to 0
        jz      gotit                           ; done
        imul    eax, r14d                       ; multiply in another x
        dec     r13d
        jmp     check
gotit:                                          ; print report on success
        lea     rdi, [rel answer]
        movsxd  rsi, eax
        xor     eax, eax                        ; printf is varargs, no vector arguments
        call    printf
        jmp     done
error1:                                         ; print error message
        lea     rdi, [rel badArgumentCount]     ; full 64-bit address, not a 32-bit absolute
        call    puts
        jmp     done
error2:                                         ; print error message
        lea     rdi, [rel negativeExponent]
        call    puts
done:                                           ; restore saved registers
        pop     r14
        pop     r13
        pop     r12
        ret

answer:
        db      "%d", 10, 0
; NOTE(review): puts appends its own newline, so the two messages below are
; followed by a blank line; the strings are kept as in the original listing.
badArgumentCount:
        db      "Requires exactly two arguments", 10, 0
negativeExponent:
        db      "The exponent may not be negative", 10, 0
1 2 3 4 5 6 7 8 9
$ nasm -felf64 power.asm && gcc -o power power.o $ ./power 2 19 524288 $ ./power 3 -8 The exponent may not be negative $ ./power 1 500 1 $ ./power 1 Requires exactly two arguments
; ----------------------------------------------------------------------------- ; A 64-bit function that returns the sum of the elements in a floating-point ; array. The function has prototype: ; ; double sum(double[] array, uint64_t length) ; -----------------------------------------------------------------------------
; double sum(double array[], uint64_t length)
; In:  rdi = address of the first element, rsi = number of elements
; Out: xmm0 = sum of the elements (0.0 when length is 0)

        global  sum

        section .text
sum:
        xorpd   xmm0, xmm0                      ; running total starts at 0
        test    rsi, rsi                        ; special case for length = 0
        jz      .finished
.add_element:
        addsd   xmm0, [rdi]                     ; add in the current array element
        add     rdi, 8                          ; move to next array element
        dec     rsi                             ; count down
        jnz     .add_element                    ; if not done counting, continue
.finished:
        ret                                     ; return value already in xmm0
注意浮点指令有一个 sd 后缀;这是最常见的一种,但我们稍后会看到其他一些。这是一个调用它的 C 程序:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
/* * Illustrates how to call the sum function we wrote in assembly language. */
; ----------------------------------------------------------------------------- ; 64-bit program that treats all its command line arguments as integers and ; displays their average as a floating point number. This program uses a data ; section to store intermediate results, not that it has to, but only to ; illustrate how data sections are used. ; -----------------------------------------------------------------------------
; int main(int argc, char **argv) -- treats every command line argument as an integer
; and prints their average as a floating point number. ABI: System V AMD64.
;
; On entry rdi = argc and rsi = argv. rdi is used as the index of the argument being
; converted and counts down from argc-1 to 1. Intermediate results are kept in the
; data section (count, sum).

        global  main
        extern  atoi
        extern  printf
        default rel

        section .text
main:
        push    rbx                             ; rbx is not used; the push realigns rsp to 16 bytes
        dec     edi                             ; argc-1, since we don't count program name
                                                ; (argc is an int; writing edi zero-extends into rdi)
        jz      nothingToAverage
        mov     [count], rdi                    ; save number of real arguments
accumulate:
        push    rdi                             ; save registers across call to atoi
        push    rsi                             ; (two pushes keep rsp 16-byte aligned)
        mov     rdi, [rsi + rdi*8]              ; argv[rdi]
        call    atoi                            ; now eax has the int value of arg
        pop     rsi                             ; restore registers after atoi call
        pop     rdi
        movsxd  rax, eax                        ; atoi returns a 32-bit int; widen it before the 64-bit add
        add     [sum], rax                      ; accumulate sum as we go
        dec     rdi                             ; count down
        jnz     accumulate                      ; more arguments?
average:
        cvtsi2sd xmm0, qword [sum]              ; explicit qword: convert the full 64-bit values
        cvtsi2sd xmm1, qword [count]
        divsd   xmm0, xmm1                      ; xmm0 is sum/count
        lea     rdi, [format]                   ; 1st arg to printf
        mov     eax, 1                          ; printf is varargs, there is 1 non-int argument
        call    printf                          ; printf(format, sum/count)
        jmp     done
nothingToAverage:
        lea     rdi, [error]
        xor     eax, eax                        ; no vector arguments
        call    printf
done:
        pop     rbx                             ; undo the alignment push
        ret

        section .data
count:  dq      0
sum:    dq      0
format: db      "%g", 10, 0
error:  db      "There are no command line arguments to average", 10, 0
1 2 3 4
$ nasm -felf64 average.asm && gcc average.o && ./a.out 19 8 21 -33 3.75 $ nasm -felf64 average.asm && gcc average.o && ./a.out There are no command line arguments to average
This program highlighted some processor instructions that convert between integers and floating point values. A few of the most common are:
; ---------------------------------------------------------------------------- ; An implementation of the recursive function: ; ; uint64_t factorial(uint64_t n) { ; return (n <= 1) ? 1 : n * factorial(n-1); ; } ; ----------------------------------------------------------------------------
; uint64_t factorial(uint64_t n)
; In:  rdi = n
; Out: rax = 1 when n <= 1, otherwise n * factorial(n - 1)
; Recursive: each level keeps its own n on the stack across the nested call.

        global  factorial

        section .text
factorial:
        cmp     rdi, 1                          ; n <= 1 (unsigned)?
        ja      .recurse                        ; no: go do a recursive call
        mov     eax, 1                          ; yes: return 1
        ret
.recurse:
        push    rdi                             ; save n on stack (also aligns rsp)
        dec     rdi                             ; n-1
        call    factorial                       ; factorial(n-1), result goes in rax
        pop     rdi                             ; restore n
        imul    rax, rdi                        ; n * factorial(n-1), stored in rax
        ret
一个示例调用者:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* * An application that illustrates calling the factorial function defined elsewhere. */
#include <stdio.h> #include <inttypes.h>
/* Implemented elsewhere (the assembly language factorial function). */
uint64_t factorial(uint64_t n);

/*
 * Prints factorial(i) for i = 0..19, one per line.
 *
 * uint64_t is printed with the PRIu64 macro from <inttypes.h> instead of a
 * hard-coded "%lu", so the format matches the argument type on every platform.
 */
int main(void) {
    for (uint64_t i = 0; i < 20; i++) {
        printf("factorial(%2" PRIu64 ") = %" PRIu64 "\n", i, factorial(i));
    }
    return 0;
}
The XMM registers can do arithmetic on floating point values one operation at a time (scalar) or multiple operations at a time (packed). The operations have the form:
op xmmreg, xmmreg_or_memory
For floating point addition, the instructions are:
addpd do 2 double-precision additions in parallel (add packed double)
addsd do just one double-precision addition, using the low 64-bits of the register (add scalar double)
addps do 4 single-precision additions in parallel (add packed single)
addss do just one single-precision addition, using the low 32-bits of the register (add scalar single)
Here’s a function that adds four floats at once:
1 2 3 4 5 6 7 8 9 10 11 12
; void add_four_floats(float x[4], float y[4]) ; x[i] += y[i] for i in range(0..4)
; void add_four_floats(float x[4], float y[4])
; In:  rdi = x, rsi = y
; Effect: x[i] += y[i] for i in 0..3; y is only read.
;
; movups is used for the loads and the store so the arrays need not be 16-byte
; aligned (an aligned move would fault on an unaligned address), and so the data
; stays in the floating-point domain used by addps.

        global  add_four_floats

        section .text
add_four_floats:
        movups  xmm0, [rdi]                     ; all four values of x
        movups  xmm1, [rsi]                     ; all four values of y
        addps   xmm0, xmm1                      ; do all four sums in one shot
        movups  [rdi], xmm0                     ; store the sums back into x
        ret
; ----------------------------------------------------------------------------------------
; Example of a function that keeps a local variable in its own stack frame.
; It returns 7 * (first argument) + (second argument).
; ----------------------------------------------------------------------------------------
; example(a, b): In rdi = a, rsi = b. Out rax = 7 * a + b.
; The constant 7 is stored in a local slot inside a 24-byte stack frame.

        global  example

FRAME_BYTES     equ     24                      ; space reserved below the return address
LOCAL_FACTOR    equ     8                       ; offset of the local variable from rsp

        section .text
example:
        sub     rsp, FRAME_BYTES                ; open the frame
        mov     qword [rsp + LOCAL_FACTOR], 7   ; local = 7
        mov     rax, [rsp + LOCAL_FACTOR]       ; rax = local
        imul    rax, rdi                        ; rax = local * a
        add     rax, rsi                        ; rax = local * a + b
        add     rsp, FRAME_BYTES                ; close the frame
        ret
在 macOS 上使用 NASM
Hopefully you’ve gone through the whole tutorial above using a Linux-based operating system (or perhaps more correctly, an ELF64 system). There are pretty much only five things to know to get these examples working under a 64-bit macOS system:
The object file format is macho64, not elf64.
The system call numbers are totally different.
Symbols shared between modules will be prefixed by underscores.
It seems that the gcc linker in macOS doesn’t allow absolute addressing unless you tweak some settings. So add default rel when you are referencing labeled memory locations, and always use lea to get your addresses.
Also, it appears that sometimes under Linux, the 16-byte stack alignment requirement is not enforced, but it appears to be always enforced under macOS.
So here’s the average program from above, written for macOS.
; ----------------------------------------------------------------------------- ; 64-bit program that treats all its command line arguments as integers and ; displays their average as a floating point number. This program uses a data ; section to store intermediate results, not that it has to, but only to ; illustrate how data sections are used. ; ; Designed for OS X. To assemble and run: ; ; nasm -fmacho64 average.asm && gcc average.o && ./a.out ; -----------------------------------------------------------------------------
; int main(int argc, char **argv) for macOS -- treats every command line argument as
; an integer and prints their average as a floating point number.
;
; On entry rdi = argc and rsi = argv. rdi is used as the index of the argument being
; converted and counts down from argc-1 to 1. Intermediate results are kept in the
; data section (count, sum).

        global  _main
        extern  _atoi
        extern  _printf
        default rel

        section .text
_main:
        push    rbx                             ; we don't ever use this, but it is necessary
                                                ; to align the stack so we can call stuff
        dec     edi                             ; argc-1, since we don't count program name
                                                ; (argc is an int; writing edi zero-extends into rdi)
        jz      nothingToAverage
        mov     [count], rdi                    ; save number of real arguments
accumulate:
        push    rdi                             ; save registers across call to atoi
        push    rsi                             ; (two pushes keep rsp 16-byte aligned)
        mov     rdi, [rsi + rdi*8]              ; argv[rdi]
        call    _atoi                           ; now eax has the int value of arg
        pop     rsi                             ; restore registers after atoi call
        pop     rdi
        movsxd  rax, eax                        ; atoi returns a 32-bit int; widen it before the 64-bit add
        add     [sum], rax                      ; accumulate sum as we go
        dec     rdi                             ; count down
        jnz     accumulate                      ; more arguments?
average:
        cvtsi2sd xmm0, qword [sum]              ; explicit qword: convert the full 64-bit values
        cvtsi2sd xmm1, qword [count]
        divsd   xmm0, xmm1                      ; xmm0 is sum/count
        lea     rdi, [format]                   ; 1st arg to printf
        mov     eax, 1                          ; printf is varargs, there is 1 non-int argument
        call    _printf                         ; printf(format, sum/count)
        jmp     done

nothingToAverage:                               ; target of the jz above; reports the empty case
        lea     rdi, [error]
        xor     eax, eax                        ; no vector arguments
        call    _printf

done:
        pop     rbx                             ; undoes the alignment push at the beginning
        ret

        section .data
count:  dq      0
sum:    dq      0
format: db      "%g", 10, 0
error:  db      "There are no command line arguments to average", 10, 0
1 2 3 4 5 6 7
$ nasm -fmacho64 average.asm && gcc average.o && ./a.out There are no command line arguments to average $ nasm -fmacho64 average.asm && gcc average.o && ./a.out 54.3 54 $ nasm -fmacho64 average.asm && gcc average.o && ./a.out 54.3 -4 -3 -25 455.1111 95.4
Using NASM on Windows
I’m not sure what the system calls are on Windows, but I do know that if you want to assemble and link with the C library, you have to understand the x64 conventions. Read them. You will learn such things as:
The first four integer parameters are passed in RCX, RDX, R8, and R9. The rest are to be pushed on the stack.
The callee must preserve RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15.
The first four floating point parameters are passed in, you guessed it, XMM0, XMM1, XMM2, and XMM3.
Return values go in RAX or XMM0.
IMPORTANT: There’s one thing that’s really hard to find in any documentation: the x64 calling convention requires you to allocate 32 bytes of shadow space before each call, and remove it after your call. This means your “hello world” program looks like this:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
; ---------------------------------------------------------------------------------------- ; This is a Win64 console program that writes "Hello" on one line and then exits. It ; uses puts from the C library. To assemble and run: ; ; nasm -fwin64 hello.asm && gcc hello.obj && a ; ----------------------------------------------------------------------------------------
; int main(void) for Win64 -- writes "Hello" with puts from the C library.
; Microsoft x64 convention: first integer argument in rcx, and the caller reserves
; 32 bytes of shadow space before each call.

        global  main
        extern  puts

SHADOW_SPACE    equ     32                      ; required by the x64 calling convention
ALIGN_PAD       equ     8                       ; offsets the return address pushed on entry

        section .text
main:
        sub     rsp, SHADOW_SPACE + ALIGN_PAD   ; reserve the shadow space (40 = 28h)
        mov     rcx, message                    ; first argument is address of message
        call    puts                            ; puts(message)
        add     rsp, SHADOW_SPACE + ALIGN_PAD   ; remove shadow space
        ret

message:
        db      'Hello', 0                      ; C strings need a zero byte at the end
Did you notice we actually reserved 40 bytes? Thirty-two bytes of shadow space is a minimum requirement. In our main function, we are calling another function, so our stack must be aligned on a 16-byte boundary. When main is called, the return address (8 bytes) was pushed, so we have to “add” an extra 8 bytes to the shadow space.