Split from assembly-cheat

cirosantilli · cirosantilli · commit 5ec066042d25 · 2015-09-04T17:26:11.000+02:00
diff --git a/Makefile b/Makefile
@@ -0,0 +1,44 @@
+.POSIX:
+
+# Build pipeline: assemble each input with nasm, extract the raw .text bytes
+# with objcopy, then render them as a space-separated hex dump.
+# Every file extension below can be overridden by the caller.
+BIN_EXT ?= .bin
+IN_EXT ?= .asm
+OBJ_EXT ?= .o
+OUT_EXT ?= .hd
+
+# Each input found in the current directory yields one hex dump.
+INS := $(wildcard *$(IN_EXT))
+OUTS := $(patsubst %$(IN_EXT),%$(OUT_EXT),$(INS))
+
+.PHONY: all clean run
+# Keep the intermediate binaries and objects that the chained pattern rules
+# would otherwise delete once the hex dumps are built.
+.PRECIOUS: %$(BIN_EXT) %$(OBJ_EXT)
+
+all: $(OUTS)
+
+# Hex-dump the raw bytes, one byte per column.
+# -v stops od from collapsing repeated output lines into a lone "*", and
+# sed removes the leading blank from every line, not only from the first.
+%$(OUT_EXT): %$(BIN_EXT)
+	od -An -tx1 -v '$<' | sed 's/^ //' > '$@'
+
+# Keep only the contents of the .text section.
+%$(BIN_EXT): %$(OBJ_EXT)
+	objcopy -O binary --only-section=.text '$<' '$@'
+
+# Assemble the input as a 32-bit ELF object.
+%$(OBJ_EXT): %$(IN_EXT)
+	nasm -f elf32 -o '$@' '$<'
+	@# For raw 16 bit. Would need to remove the objcopy step.
+	@#nasm -f bin -o '$@' '$<'
+
+# Remove every generated file.
+clean:
+	rm -f *$(BIN_EXT) *$(OBJ_EXT) *$(OUT_EXT)
+
+# Build everything, then print the hex dumps.
+run: all
+	tail -n+1 *$(OUT_EXT)
diff --git a/README.md b/README.md
@@ -0,0 +1,11 @@
+# x86 Instruction Encoding Tutorial
+
+1.  [Introduction](introduction.md)
+1.  [Global structure](global-structure.md)
+1.  [Intel manual format](intel-manual-format.md)
+1.  Examples
+    1. [nop](nop.md)
+    1. [push ebp](push-ebp.md)
+    1. [mov eax, 1](mov-eax-1.md)
+    1. [mov eax, ebx](mov-eax-ebx.md)
+1.  [Bibliography](bibliography.md)
diff --git a/add-eax-ebx.md b/add-eax-ebx.md
@@ -0,0 +1,40 @@
+# add eax, ebx
+
+Output:
+
+    01 d8
+    ^^ ^^
+
+1. Opcode
+1. ModR/M
+
+Opcode bits:
+
+    0 0 0 0 0 0 0 1
+    ^^^^^^^^^^^ ^ ^
+    1           2 3
+
+1. This is an add.
+2. Add REG to R/M as represented on the ModR/M byte. Otherwise, other way around.
+3. 32-bit operands. Otherwise, 8-bit.
+
+ModR/M bits:
+
+    1 1 0 1 1 0 0 0
+    ^^^ ^^^^^ ^^^^^
+    1   2     3
+
+1. MOD = 3: REG and R/M are registers.
+2. REG = 3: EBX
+3. R/M = 0: EAX
+
+So from the opcode, we add REG (EBX) to R/M (EAX).
+
+Note that two encodings are possible for reg / reg operations: we could set the second-to-last opcode bit to 1 and swap the two registers in the ModR/M byte.
+
+Both possible encodings are documented on the instruction table:
+
+    01 /r    ADD r/m32, r32
+    03 /r    ADD r32, r/m32
+
+`/r` says that a ModR/M byte follows the opcode, and that it contains both a register operand and an r/m operand.
diff --git a/bibliography.md b/bibliography.md
@@ -0,0 +1,14 @@
+# Bibliography
+
+-   Intel® 64 and IA-32 Architectures Software Developer’s Manual
+
+    - section 2.1: binary serialization
+    - section 3.1: documentation format
+
+-   <http://www.c-jump.com/CIS77/CPU/x86/lecture.html>
+
+-   <http://www.codeproject.com/Articles/662301/x-Instruction-Encoding-Revealed-Bit-Twiddling-fo>
+
+-   <http://wiki.osdev.org/X86-64_Instruction_Encoding>
+
+-   <http://www.strchr.com/machine_code_redundancy>
diff --git a/global-structure.md b/global-structure.md
@@ -0,0 +1,56 @@
+# Global structure
+
+Legend: `X-Y: description`, where `X` is the minimum, and `Y` is the maximum number of bytes.
+
+- 0-4: instruction prefixes
+- 1-4: opcode
+- 0-1: ModR/M
+- 0-1: SIB
+- 0-4: displacement
+- 0-4: immediate
+
+The most interesting bytes to start learning are the opcode and ModR/M.
+
+## Opcode
+
+Says which instruction is being run.
+
+Sometimes, this can be further decomposed into smaller parts which say what is the source of data. E.g. [push ebp](push-ebp.asm), documented in the manual as `+rd`.
+
+## ModR/M
+
+Says where data is being moved to. Bits:
+
+    7 6 5 4 3 2 1 0
+    ^^^ ^^^^^ ^^^^^
+    1   2     3
+
+1.  MOD
+
+    Determines how the next fields are interpreted.
+
+    - 00: Indirect addressing mode.
+    - 01: Same as 00 but an 8-bit displacement is added to the value before dereferencing.
+    - 10: Same as 01 but with a 32-bit displacement instead of an 8-bit one.
+    - 11: The REG and R/M fields each refer to a register.
+
+2.  REG
+
+    - 000 (0): EAX (AX if data size is 16 bits, AL if data size is 8 bits)
+    - 001 (1): ECX/CX/CL
+    - 010 (2): EDX/DX/DL
+    - 011 (3): EBX/BX/BL
+    - 100 (4): ESP/SP (AH if data size is defined as 8 bits)
+    - 101 (5): EBP/BP (CH if data size is defined as 8 bits)
+    - 110 (6): ESI/SI (DH if data size is defined as 8 bits)
+    - 111 (7): EDI/DI (BH if data size is defined as 8 bits)
+
+3.  R/M
+
+## Prefixes
+
+### 66
+
+If given while in 16-bit mode, treat the operands as 32-bit.
+
+If given while in 32-bit mode, treat the operands as 16-bit.
diff --git a/intel-manual-format.md b/intel-manual-format.md
@@ -0,0 +1,58 @@
+# Intel manual format
+
+How the Intel manual documents the instruction encodings.
+
+- Opcode
+- Instruction
+- Op / En
+- 64-Bit Mode
+- Compat / Leg Mode
+- `CPUID` feature flag
+- Description
+
+They are explained in section 3.1.
+
+### Instruction
+
+E.g.:
+
+    XCHG EAX, r32
+
+Means: takes 2 arguments:
+
+- `EAX`: TODO
+- `r32`: a 32-bit register
+
+Other important values:
+
+- `r/m32`: either a 32-bit register or a 32-bit memory operand
+- `imm32`: a 32-bit immediate value encoded directly in the instruction
+
+### Op/En
+
+### Operand Encoding
+
+Refers to an entry on the "Instruction Operand Encoding" table.
+
+Every instruction has its own "Instruction Operand Encoding" table.
+
+TODO understand an operand encoding table, e.g. for `mov`.
+
+### CPUID feature flag
+
+Which CPU feature, as reported by `CPUID`, must be present for the instruction to be supported.
+
+### Compat / Leg Mode
+
+- valid
+- invalid: can be encoded, but generates an exception
+- N.E.: not encodable
+
+### 64-bit mode
+
+- V: Supported.
+- I: Not supported.
+- N.E.: instruction syntax is not encodable in 64-bit mode (it may represent part of a sequence of valid instructions in other modes).
+- N.P.: REX prefix does not affect the legacy instruction in 64-bit mode.
+- N.I.: opcode is treated as a new instruction in 64-bit mode.
+- N.S.: requires an address override prefix in 64-bit mode and is not supported. Using an address override prefix in 64-bit mode may result in model-specific execution behavior
diff --git a/introduction.md b/introduction.md
@@ -0,0 +1,14 @@
+# Introduction
+
+Assemble all `.asm` inputs and convert them into `.hd` hexdumps:
+
+    sudo apt-get install nasm
+    make run
+
+To learn, rotate quickly between:
+
+- the examples
+- the general instruction organization
+- the Intel manual
+
+Until your brain starts to absorb them.
diff --git a/mov-al-1.asm b/mov-al-1.asm
@@ -0,0 +1 @@
+mov al, 1
diff --git a/mov-ax-1.asm b/mov-ax-1.asm
@@ -0,0 +1 @@
+mov ax, 1
diff --git a/mov-eax-1.asm b/mov-eax-1.asm
@@ -0,0 +1 @@
+mov eax, 1
diff --git a/mov-eax-1.md b/mov-eax-1.md
@@ -0,0 +1,36 @@
+# mov eax, 1
+
+Basic mov immediate instruction.
+
+Output:
+
+    b8 01 00 00 00
+    ^^ ^^^^^^^^^^^
+    1  2
+
+1.  Opcode
+2.  Immediate value: `1` in little endian
+
+Opcode bits:
+
+    1 0 1 1 1 0 0 0
+    ^^^^^^^^^ ^^^^^
+    1         2
+
+1. What to do.
+2. Where to move to. `000` is `eax`.
+
+Intel documentation says:
+
+-   Opcode: `B8 + rd id`.
+
+    `+rd` says that the 3 bits at the end are the destination register.
+
+    `id` says that a double word immediate follows.
+
+-   Op/En: `OI`.
+
+    The "Instruction Operand Encoding" table for `mov` and `OI` says:
+
+    Operand 1: `opcode + rd (w)`
+    Operand 2: `imm8/16/32/64`
diff --git a/mov-eax-ebx.asm b/mov-eax-ebx.asm
@@ -0,0 +1 @@
+mov eax, ebx
diff --git a/mov-eax-ebx.md b/mov-eax-ebx.md
@@ -0,0 +1,41 @@
+# mov eax, ebx
+
+Output:
+
+    89 d8
+    ^^ ^^
+    1  2
+
+1. Opcode
+1. ModR/M
+
+Opcode bits:
+
+    1 0 0 0 1 0 0 1
+    ^^^^^^^^^^^ ^ ^
+    1           2 3
+
+1. This is a `mov`.
+2. Move REG to R/M as represented on the ModR/M byte. Otherwise, other way around.
+3. 32-bit operands. Otherwise, 8-bit.
+
+ModR/M bits:
+
+    1 1 0 1 1 0 0 0
+    ^^^ ^^^^^ ^^^^^
+    1   2     3
+
+1. MOD = 3: REG and R/M are registers.
+2. REG = 3: EBX
+3. R/M = 0: EAX
+
+So from the opcode, we move REG (EBX) into R/M (EAX).
+
+Note that two encodings are possible for reg / reg operations: we could set the second-to-last opcode bit to 1 and swap the two registers in the ModR/M byte.
+
+Both possible encodings are documented on the instruction table:
+
+    89 /r    MOV r/m32, r32
+    8B /r    MOV r32, r/m32
+
+`/r` says that a ModR/M byte follows the opcode, and that it contains both a register operand and an r/m operand.
diff --git a/mov-eax-x-val.asm b/mov-eax-x-val.asm
@@ -0,0 +1,4 @@
+; mov value at address
+mov eax, [x]
+x:
+db0 db 0xFF
diff --git a/mov-eax-x.asm b/mov-eax-x.asm
@@ -0,0 +1,4 @@
+; mov address
+mov eax, x
+x:
+db0 db 0xFF
diff --git a/mov-ebx-1.asm b/mov-ebx-1.asm
@@ -0,0 +1,2 @@
+; See how ebx is encoded.
+mov ebx, 1
diff --git a/mov-ecx-1.asm b/mov-ecx-1.asm
@@ -0,0 +1,2 @@
+; See how ecx is encoded.
+mov ecx, 1
diff --git a/nop.asm b/nop.asm
@@ -0,0 +1 @@
+nop
diff --git a/nop.md b/nop.md
@@ -0,0 +1,5 @@
+# nop
+
+`0x90` is the simple form.
+
+But it also has other multi-byte forms that can be used for alignment.
diff --git a/push-ebp.asm b/push-ebp.asm
@@ -0,0 +1 @@
+push ebp
diff --git a/push-ebp.md b/push-ebp.md
@@ -0,0 +1,18 @@
+# push ebp
+
+Output:
+
+    55
+
+Which is a single opcode.
+
+The opcode can be further decomposed into the following bits:
+
+    0 1 0 1 0 1 0 1
+    ^^^^^^^^^ ^^^^^
+    1         2
+
+1.  It is a `push`.
+2.  From where we will push. `101` is ebp.
+
+This is documented as: opcode == `50+rd` in the Intel manual. The `+rd` part says that the 3 last bits indicate where to push from.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+; See how ebx is encoded.`
	`2`	`+mov ebx, 1`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+; See how ecx is encoded.`
	`2`	`+mov ecx, 1`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +# nop
++
 +`0x90` is the simple form.
++
 +But also has other multi-byte forms that can be used for alignment.