-rw-r--r--  LICENSE       |  14
-rw-r--r--  README.md     |  54
-rw-r--r--  asm.peg       | 190
-rw-r--r--  main.c        | 218
-rw-r--r--  minias.h      |   1
-rw-r--r--  parse.c       |   9
-rw-r--r--  test/test.sh  |  38
7 files changed, 325 insertions, 199 deletions
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,14 @@
+Copyright © 2021 Andrew Chambers
+Copyright © 2017-2020 Michael Forney
+
+Permission to use, copy, modify, and/or distribute this software for any purpose
+with or without fee is hereby granted, provided that the above copyright notice
+and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
+THIS SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
@@ -1,11 +1,14 @@
 # minias
 
-A mini assembler for x86_64, written for fun and learning.
+A mini assembler for x86-64, written for fun and learning.
 
-Goals:
+Minias can assemble large amounts of real world software after it has been compiled with the [cproc C compiler](https://sr.ht/~mcf/cproc/). It can also
+assemble self hosted cproc, meaning it can indirectly assemble itself.
+
+Project Goals:
 
 - A simple, tiny, fast implementation (in that order).
-- Assemble the output of [cproc](https://github.com/michaelforney/cproc)/[qbe](https://c9x.me/compile/) and [chibicc](https://github.com/rui314/chibicc).
+- Assemble the output of [cproc](https://sr.ht/~mcf/cproc/)/[qbe](https://c9x.me/compile/) and [chibicc](https://github.com/rui314/chibicc).
 - Relocatable elf output.
 
 Non Goals:
@@ -29,26 +32,53 @@ leg asm.peg > asm.peg.inc
 cc -O2 *.c -o minias
 ```
 
+# Roadmap
+
+Essential features:
+
+- [x] Self host with cproc.
+- [ ] Self host with chibicc.
+
+Bonus features:
+
+- [ ] A man page explaining what is supported.
+- [x] Two pass jump relaxing.
+- [ ] Immediate relaxing.
+- [ ] Simple immediate expressions.
+- [ ] Assemble a libc.
+- [ ] Test every opcode with all variants in our test suite.
+- [ ] Parser that doesn't depend on peg/leg.
+
 # Notes
 
-- Minias deliberately does not free data as it all is
-  freed by the OS at the end of execution. In the future
-  we one ould use an arena allocator for minias and still
-  avoid manual calls to free.
+- The implementation deliberately does not free allocated memory as it all is
+  freed by the OS at the end of execution. Memory usage is still
+  quite light as it uses string and value interning. In the future
+  we could use an arena allocator for minias and still avoid manual calls to free.
+
+- The implementation deliberately uses global variables in a style similar to class members in C++.
+  This is a more traditional unix style where the unit of data encapsulation is a
+  small program. This choice makes sense given we don't aim to build a library.
 
-- Minias deliberately kept the peg grammar quite repetitive
-  and simple, please keep it this.
+- Minias deliberately keeps the peg grammar quite repetitive
+  and simple, please keep it this way.
 
-- Our performance is quite fast, but with the current design
-  it is limited by the parser, it would be interesting
-  to see if we can improve the parser generator upstream.
+- Performance is limited by the parser, it would be interesting
+  to see if we can improve the parser generator upstream. That being said,
+  performance is often better than gnu as and much better than the clang assembler.
-
   One day it would be nice to write a 'minipeg' in a single .c file that can be bundled in projects.
 
+# Contact
+
+Ask questions on the [mailing list](https://lists.sr.ht/~ach/minias).
+Submit bugs to the [ticket system](https://todo.sr.ht/~ach/minias).
+
 # Resources
 
 - [intel reference](https://software.intel.com/content/dam/develop/external/us/en/documents-tps/325383-sdm-vol-2abcd.pdf) - Specifically chapter 2.1 and chapter 3.1.
 - [elf spec](https://refspecs.linuxfoundation.org/elf/elf.pdf)
+- [osdev wiki](https://wiki.osdev.org/X86-64_Instruction_Encoding)
 - [goas](https://github.com/DQNEO/goas)
 - [neatas](https://repo.or.cz/neatas.git)
diff --git a/asm.peg b/asm.peg
@@ -5,13 +5,10 @@ line =
   | . { yy->v.kind = ASM_SYNTAX_ERROR; }
 
 ws = ([ \t]+ | comment)+
-
 comment = "/*" ( ! "\n" ! "*/" . )* "*/"
 
 # No support for multiline comments for now as they break our line numbers.
-
 eolcomment = '#' (! "\n" .)+
 
 eol = ws? eolcomment? "\n"
-
 stmt =
   '.' d:directive eol {$$ = d;}
   | i:instr eol { $$ = i; }
@@ -66,25 +63,50 @@ label =
   { $$.label = (Label){.kind = ASM_LABEL, .name = i.charptr}; }
 
 instr =
-  # Ordered by relative frequency for performance.
-
-  # Movs are very common, so they come first.
+  # Ordered by instruction frequency for performance.
+  # e.g. movs are very common, so they come first.
+  # The & operator means check without consuming input.
   (& 'm'
     (
-    i:mov { $$ = i; }
-    | i:movsx { $$ = i; }
-    | i:movzx { $$ = i; }
-    | i:mul { $$ = i; }
-    # Less common, but we have already checked for 'm'
-    | i:movaps { $$ = i; }
-    | i:movq { $$ = i; }
-    | i:movsd { $$ = i; }
-    | i:movss { $$ = i; }
-    | i:mulsd { $$ = i; }
-    | i:mulss { $$ = i; }))
-  | i:add { $$ = i; }
-  | i:and { $$ = i; }
-  | i:cmp { $$ = i; }
+      i:mov { $$ = i; }
+      | i:movsx { $$ = i; }
+      | i:movzx { $$ = i; }
+      | i:mul { $$ = i; }
+      | i:movaps { $$ = i; }
+      | i:movq { $$ = i; }
+      | i:movsd { $$ = i; }
+      | i:movss { $$ = i; }
+      | i:mulsd { $$ = i; }
+      | i:mulss { $$ = i; }))
+  | (& 'a'
+    (
+      i:add { $$ = i; }
+      | i:and { $$ = i; }
+      | i:addss { $$ = i; }
+      | i:addsd { $$ = i; } ))
+  | (& 'c'
+    (
+      i:cmp { $$ = i; }
+      | i:call { $$ = i; }
+      | i:cvtsi2sd { $$ = i; }
+      | i:cvtsi2ss { $$ = i; }
+      | i:cvtss2sd { $$ = i; }
+      | i:cvtsd2ss { $$ = i; }
+      | i:cvttsd2si { $$ = i; }
+      | i:cvttss2si { $$ = i; }
+      | i:cltd { $$ = i; }
+      | i:cqto { $$ = i; }))
+  | (& 's'
+    (
+      i:set { $$ = i; }
+      | i:sub { $$ = i; }
+      | i:sal { $$ = i; }
+      | i:sar { $$ = i; }
+      | i:shl { $$ = i; }
+      | i:shr { $$ = i; }
+      | i:subsd { $$ = i; }
+      | i:subss { $$ = i; }))
+  | i:or { $$ = i; }
   | i:leave { $$ = i; }
   | i:ret { $$ = i; }
   | i:push { $$ = i; }
@@ -95,44 +117,18 @@ instr =
   | i:lea { $$ = i; }
   | i:imul { $$ = i; }
   | i:neg { $$ = i; }
-  | i:or { $$ = i; }
-  | (& 's'
-    (
-    i:set { $$ = i; }
-    | i:sub { $$ = i; }
-    | i:sal { $$ = i; }
-    | i:sar { $$ = i; }
-    | i:shl { $$ = i; }
-    | i:shr { $$ = i; }))
   | i:test { $$ = i; }
   | i:xchg { $$ = i; }
   | i:xor { $$ = i; }
-  | i:call { $$ = i; }
-  # Misc
-  | i:cltd { $$ = i; }
-  | i:cqto { $$ = i; }
-  | i:nop { $$ = i; }
   # Floating point is less common, so check last.
-  | i:addss { $$ = i; }
-  | i:addsd { $$ = i; }
   | i:divss { $$ = i; }
   | i:divsd { $$ = i; }
   | i:pxor { $$ = i; }
   | i:xorpd { $$ = i; }
   | i:xorps { $$ = i; }
-  | i:subsd { $$ = i; }
-  | i:subss { $$ = i; }
   | i:ucomisd { $$ = i; }
   | i:ucomiss { $$ = i; }
-  | (& 'c'
-    (
-    i:cvtsi2sd { $$ = i; }
-    | i:cvtsi2ss { $$ = i; }
-    | i:cvtss2sd { $$ = i; }
-    | i:cvtsd2ss { $$ = i; }
-    | i:cvttsd2si { $$ = i; }
-    | i:cvttss2si { $$ = i; }))
-
+  | i:nop { $$ = i; }
 
 cltd = "cltd" { $$ = (Parsev){ .kind=ASM_CLTD }; }
 cqto = "cqto" { $$ = (Parsev){ .kind=ASM_CQTO }; }
@@ -140,17 +136,15 @@ leave = "leave" { $$ = (Parsev){ .kind=ASM_LEAVE }; }
 nop = "nop" { $$ = (Parsev){ .kind=ASM_NOP }; }
 ret = "ret" { $$ = (Parsev){ .kind=ASM_RET }; }
 
-push =
-  "push" (
-    'q'? ws s:r64 { $$ = INSTR1(0, s); }
-    | 'q' ws s:mem { $$ = INSTR1(1, s); }
-  ) { $$.instr.kind = ASM_PUSH; }
+push = "push" (
+  'q'? ws s:r64 { $$ = INSTR1(0, s); }
+  | 'q' ws s:mem { $$ = INSTR1(1, s); }
+) { $$.instr.kind = ASM_PUSH; }
 
-pop =
-  "pop" (
-    'q'? ws d:r64 { $$ = INSTR1(0, d); }
-    | 'q' ws d:mem { $$ = INSTR1(1, d); }
-  ) { $$.instr.kind = ASM_POP; }
+pop = "pop" (
+  'q'? ws d:r64 { $$ = INSTR1(0, d); }
+  | 'q' ws d:mem { $$ = INSTR1(1, d); }
+) { $$.instr.kind = ASM_POP; }
 
 call = "call" 'q'?
   ws (
     '*' t:mem
@@ -175,20 +169,22 @@ condition-code =
   | "pe" { $$.i64 = 3; }
   | "p" { $$.i64 = 4; }
   | "o" { $$.i64 = 5; }
-  | "nz" { $$.i64 = 6; }
-  | "ns" { $$.i64 = 7; }
-  | "np" { $$.i64 = 8; }
-  | "no" { $$.i64 = 9; }
-  | "nle" { $$.i64 = 10; }
-  | "nl" { $$.i64 = 11; }
-  | "nge" { $$.i64 = 12; }
-  | "ng" { $$.i64 = 13; }
-  | "ne" { $$.i64 = 14; }
-  | "nc" { $$.i64 = 15; }
-  | "nbe" { $$.i64 = 16; }
-  | "nb" { $$.i64 = 17; }
-  | "nae" { $$.i64 = 18; }
-  | "na" { $$.i64 = 19; }
+  | ("n"
+    (
+      "z" { $$.i64 = 6; }
+      | "s" { $$.i64 = 7; }
+      | "p" { $$.i64 = 8; }
+      | "o" { $$.i64 = 9; }
+      | "le" { $$.i64 = 10; }
+      | "l" { $$.i64 = 11; }
+      | "ge" { $$.i64 = 12; }
+      | "g" { $$.i64 = 13; }
+      | "e" { $$.i64 = 14; }
+      | "c" { $$.i64 = 15; }
+      | "be" { $$.i64 = 16; }
+      | "b" { $$.i64 = 17; }
+      | "ae" { $$.i64 = 18; }
+      | "a" { $$.i64 = 19; }))
   | "le" { $$.i64 = 20; }
   | "l" { $$.i64 = 21; }
   | "ge" { $$.i64 = 22; }
@@ -429,22 +425,22 @@ test = "test" (
 
 addsd = "addsd" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_ADDSD; }
 
 addss = "addss" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_ADDSS; }
 
 subsd = "subsd" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_SUBSD; }
 
 subss = "subss" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_SUBSS; }
 
 cvtsi2sd = "cvtsi2sd" (
@@ -463,12 +459,12 @@ cvtsi2ss = "cvtsi2ss" (
 
 cvtss2sd = "cvtss2sd" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_CVTSS2SD; }
 
 cvtsd2ss = "cvtsd2ss" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_CVTSD2SS; }
 
 cvttss2si = "cvttss2si" (
@@ -492,7 +488,7 @@ divsd = "divsd" (
 
 divss = "divss" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_DIVSS; }
 
 movaps = "movaps" (
@@ -503,12 +499,12 @@ movaps = "movaps" (
 
 mulsd = "mulsd" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_MULSD; }
 
 mulss = "mulss" (
   ws s:xmm ws? ',' ws? d:xmm { $$ = INSTR2(0, s, d); }
-  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
+  | ws s:mem ws? ',' ws? d:xmm { $$ = INSTR2(1, s, d); }
 ) { $$.instr.kind = ASM_MULSS; }
 
 movss = "movss" (
@@ -667,24 +663,24 @@ r64 = "%r" (
   | "15" ![lwb] { $$ = REG(R15); }
 )
 
-xmm = "%x" (
+xmm = "%xmm" (
   # Reverse order due to peg ordering.
-  "mm15" { $$ = REG(XMM15); }
-  | "mm14" { $$ = REG(XMM14); }
-  | "mm13" { $$ = REG(XMM13); }
-  | "mm12" { $$ = REG(XMM12); }
-  | "mm11" { $$ = REG(XMM11); }
-  | "mm10" { $$ = REG(XMM10); }
-  | "mm9" { $$ = REG(XMM7); }
-  | "mm8" { $$ = REG(XMM7); }
-  | "mm7" { $$ = REG(XMM7); }
-  | "mm6" { $$ = REG(XMM6); }
-  | "mm5" { $$ = REG(XMM5); }
-  | "mm4" { $$ = REG(XMM4); }
-  | "mm3" { $$ = REG(XMM3); }
-  | "mm2" { $$ = REG(XMM2); }
-  | "mm1" { $$ = REG(XMM1); }
-  | "mm0" { $$ = REG(XMM0); }
+  "15" { $$ = REG(XMM15); }
+  | "14" { $$ = REG(XMM14); }
+  | "13" { $$ = REG(XMM13); }
+  | "12" { $$ = REG(XMM12); }
+  | "11" { $$ = REG(XMM11); }
+  | "10" { $$ = REG(XMM10); }
+  | "9" { $$ = REG(XMM7); }
+  | "8" { $$ = REG(XMM7); }
+  | "7" { $$ = REG(XMM7); }
+  | "6" { $$ = REG(XMM6); }
+  | "5" { $$ = REG(XMM5); }
+  | "4" { $$ = REG(XMM4); }
+  | "3" { $$ = REG(XMM3); }
+  | "2" { $$ = REG(XMM2); }
+  | "1" { $$ = REG(XMM1); }
+  | "0" { $$ = REG(XMM0); }
 )
 
 # We disallow newlines in our strings, it is simpler for lineno tracking.
diff --git a/main.c b/main.c
@@ -1,9 +1,11 @@
 #include "minias.h"
-#include <getopt.h>
 
 /* Parsed assembly */
 static AsmLine *allasm = NULL;
 
+/* Number of assembly relaxation passes. */
+static int nrelax = 1;
+
 /* Symbol table. */
 static struct hashtable *symbols = NULL;
@@ -29,7 +31,7 @@ static Section *datarel = NULL;
 static char *infilename = "<stdin>";
 static size_t curlineno = 0;
 
-void lfatal(const char *fmt, ...) {
+static void lfatal(const char *fmt, ...) {
   va_list ap;
   fprintf(stderr, "%s:%ld: ", infilename, curlineno);
   va_start(ap, fmt);
@@ -45,8 +47,11 @@ static Symbol *getsym(const char *name) {
   htabkey(&htk, name, strlen(name));
   ps = (Symbol **)htabput(symbols, &htk);
   if (!*ps) {
-    *ps = zalloc(sizeof(Symbol));
-    (*ps)->name = name;
+    *ps = xmalloc(sizeof(Symbol));
+    **ps = (Symbol){
+        .name = name,
+        .wco = -1,
+    };
   }
   s = *ps;
   return s;
@@ -155,7 +160,7 @@ static void initsections(void) {
   datarel->hdr.sh_entsize = sizeof(Elf64_Rela);
 }
 
-Relocation *newreloc() {
+static Relocation *newreloc() {
   if (nrelocs == reloccap) {
     reloccap = nrelocs ? nrelocs * 2 : 64;
     relocs = xreallocarray(relocs, reloccap, sizeof(Relocation));
@@ -208,15 +213,7 @@ static uint8_t regbits(AsmKind k) { return (k - (ASM_REG_BEGIN + 1)) % 16; }
 
 static uint8_t isreg64(AsmKind k) { return k >= ASM_RAX && k <= ASM_R15; }
 
-/* Register that requires the use of a rex prefix. */
-static uint8_t isrexreg(AsmKind k) {
-  return k > ASM_REG_BEGIN && k < ASM_REG_END &&
-         (regbits(k) & (1 << 3) || k == ASM_SPL || k == ASM_BPL ||
-          k == ASM_SIL || k == ASM_DIL);
-}
-
-/* Compose a rex prefix - See intel manual. */
-
+/* Rex opcode prefix. */
 typedef struct Rex {
   uint8_t required : 1;
   uint8_t w : 1;
@@ -225,6 +222,13 @@ typedef struct Rex {
   uint8_t b : 1;
 } Rex;
 
+/* Register that requires the use of a rex prefix. */
+static uint8_t isrexreg(AsmKind k) {
+  return k > ASM_REG_BEGIN && k < ASM_REG_END &&
+         (regbits(k) & (1 << 3) || k == ASM_SPL || k == ASM_BPL ||
+          k == ASM_SIL || k == ASM_DIL);
+}
+
 static uint8_t rexbyte(Rex rex) {
   return ((1 << 6) | (rex.w << 3) | (rex.r << 2) | (rex.x << 1) | rex.b);
}
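rexbyte() above packs the REX.W/R/X/B flags under the fixed 0100 pattern from the Intel manual referenced in the README. As a quick sanity check of that bit layout, here is a small standalone sketch (an illustration only, not part of the project's code) that recomputes the prefix for `addq %r9, %rax`: W is set for the 64-bit operation and R is set because %r9 sits in the ModRM reg field, giving 0x4c (gas encodes the full instruction as `4c 01 c8`).

```
#include <stdint.h>
#include <stdio.h>

/* Recompute a REX prefix with the same bit layout as rexbyte():
   0100 W R X B. Standalone illustration, independent of minias. */
static uint8_t rex(uint8_t w, uint8_t r, uint8_t x, uint8_t b) {
  return (1 << 6) | (w << 3) | (r << 2) | (x << 1) | b;
}

int main(void) {
  /* addq %r9, %rax: 64-bit operand (W=1), reg field holds %r9 (R=1),
     no SIB index (X=0), rm field holds %rax (B=0). */
  printf("0x%02x\n", rex(1, 1, 0, 0)); /* prints 0x4c */
  return 0;
}
```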
@@ -379,7 +383,7 @@ static void assemblemem(const Memarg *memarg, Rex rex, VarBytes prefix,
   assemblemodregrm(rex, prefix, opcode, mod, reg, rm);
 
   if (mod == 1) {
-    assemblereloc(memarg->disp.l, memarg->disp.c, 1, R_X86_64_8);
+    assembleconstant(memarg->disp.c, 1);
   } else if (mod == 2) {
     assemblereloc(memarg->disp.l, memarg->disp.c, 4, R_X86_64_32);
   }
@@ -439,7 +443,7 @@ static void assemblemem(const Memarg *memarg, Rex rex, VarBytes prefix,
   sb(sibbyte(scale, index, base));
 
   if (mod == 1) {
-    assemblereloc(memarg->disp.l, memarg->disp.c, 1, R_X86_64_8);
+    assembleconstant(memarg->disp.c, 1);
   } else if (mod == 2) {
     assemblereloc(memarg->disp.l, memarg->disp.c, 4, R_X86_64_32);
   }
@@ -559,7 +563,7 @@ static void assemblexchg(const Instr *xchg) {
     rex = (Rex){
         .required = isrexreg(xchg->arg1->kind) || isrexreg(xchg->arg2->kind),
         .w = isreg64(xchg->arg1->kind) || isreg64(xchg->arg2->kind),
-        .r = !!(regbits(reg) & (1 << 3)),
+        .b = !!(regbits(reg) & (1 << 3)),
     };
     assembleplusr(rex, prefix, opcode, regbits(reg));
   } else {
@@ -740,6 +744,82 @@ static void assembleset(const Instr *instr) {
   }
 }
 
+static void assemblecall(const Call *call) {
+  Rex rex;
+  uint8_t rm;
+
+  if (call->indirect) {
+    if (call->target.indirect->kind == ASM_MEMARG) {
+      rex = (Rex){0};
+      assemblemem(&call->target.indirect->memarg, rex, -1, 0xff, 0x02);
+    } else {
+      rm = regbits(call->target.indirect->kind);
+      rex = (Rex){.b = !!(rm & (1 << 3))};
+      assemblemodregrm(rex, -1, 0xff, 0x03, 0x02, rm);
+    }
+  } else {
+    sb(0xe8);
+    assemblereloc(call->target.direct.l, call->target.direct.c - 4, 4,
+                  R_X86_64_PC32);
+  }
+}
+
+static void assembleimul(const Instr *instr) {
+  VarBytes prefix, opcode;
+
+  if (instr->variant < 8) {
+    assembledivmulneg(instr, 0x05);
+  } else if (instr->variant < 14) {
+    opcode = 0x01000faf;
+    prefix = ((instr->variant - 8) % 3) == 0 ? 0x66 : -1;
+    assemblerrm(instr, prefix, opcode, 1);
+  } else {
+    const Imm *imm;
+    imm = &instr->arg3->imm;
+    opcode = 0x69;
+    prefix = ((instr->variant - 14) % 3) == 0 ? 0x66 : -1;
+    assemblerrm(instr, prefix, opcode, 1);
+    assemblereloc(imm->v.l, imm->v.c, imm->nbytes, R_X86_64_32);
+  }
+}
+
+static void assemblejmp(const Jmp *j) {
+  int jmpsize;
+  int64_t distance;
+  Symbol *target;
+
+  static uint8_t variant2op[31] = {
+      0xe9, 0x84, 0x88, 0x8b, 0x8a, 0x8a, 0x80, 0x85, 0x89, 0x8b, 0x81,
+      0x8f, 0x8d, 0x8c, 0x8e, 0x85, 0x83, 0x87, 0x83, 0x82, 0x86, 0x8e,
+      0x8c, 0x8d, 0x8f, 0x84, 0x82, 0x86, 0x82, 0x83, 0x87,
+  };
+
+  jmpsize = 4;
+  target = getsym(j->target);
+  if (cursection == target->section && (target->defined || target->wco != -1)) {
+    if (target->defined) {
+      distance = target->offset - cursection->hdr.sh_size;
+    } else {
+      distance = target->wco - cursection->hdr.sh_size;
+    }
+    if ((distance - 1) >= -128 && (distance - 1) <= 127) {
+      jmpsize = 1;
+    } else {
+      jmpsize = 4;
+    }
+  }
+
+  if (jmpsize == 4) {
+    if (j->variant)
+      sb(0x0f);
+    sb(variant2op[j->variant]);
+    assemblereloc(j->target, -4, 4, R_X86_64_PC32);
+  } else {
+    sb(variant2op[j->variant] + (j->variant ? -16 : 2));
+    assemblereloc(j->target, -1, 1, R_X86_64_PC8);
+  }
+}
+
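assemblejmp() above chooses between the short and near jump forms: when the target is in the same section and its offset (or worst case offset from a previous pass) is known, the displacement is emitted as a 1-byte rel8 if it fits in a signed byte, otherwise as a 4-byte rel32 with the 0x0f-prefixed opcode from variant2op. The sketch below isolates just that size decision; jmpsize() here is a hypothetical helper, not a function in main.c.

```
#include <stdint.h>
#include <stdio.h>

/* Rel8-vs-rel32 sizing for a jump. `distance` is the target offset minus
   the current section size, and the -1 margin mirrors the check in
   assemblejmp(). */
static int jmpsize(int64_t distance, int target_known) {
  if (!target_known)
    return 4; /* unknown target: assume the long form */
  if ((distance - 1) >= -128 && (distance - 1) <= 127)
    return 1; /* fits in a signed byte: short jump */
  return 4;
}

int main(void) {
  /* A jump over ~20 bytes relaxes to the short form; 1000 bytes does not. */
  printf("%d %d\n", jmpsize(-20, 1), jmpsize(1000, 1)); /* prints 1 4 */
  return 0;
}
```

The short-form opcode then comes from the same table: the long conditional form for `jne` is 0x0f followed by 0x85, and subtracting 16 gives the short opcode 0x75, while the unconditional 0xe9 becomes 0xeb by adding 2, matching the `variant ? -16 : 2` adjustment above.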
 static void assemble(void) {
   Symbol *sym;
   AsmLine *l;
@@ -847,38 +927,12 @@ static void assemble(void) {
         lfatal("%s already defined", sym->name);
       sym->defined = 1;
       break;
-    case ASM_CALL: {
-      Rex rex;
-      uint8_t rm;
-
-      if (v->call.indirect) {
-        if (v->call.target.indirect->kind == ASM_MEMARG) {
-          rex = (Rex){0};
-          assemblemem(&v->call.target.indirect->memarg, rex, -1, 0xff, 0x02);
-        } else {
-          rm = regbits(v->call.target.indirect->kind);
-          rex = (Rex){.b = !!(rm & (1 << 3))};
-          assemblemodregrm(rex, -1, 0xff, 0x03, 0x02, rm);
-        }
-      } else {
-        sb(0xe8);
-        assemblereloc(v->call.target.direct.l, v->call.target.direct.c - 4, 4,
-                      R_X86_64_PC32);
-      }
+    case ASM_CALL:
+      assemblecall(&v->call);
       break;
-    }
-    case ASM_JMP: {
-      static uint8_t variant2op[31] = {
-          0xe9, 0x84, 0x88, 0x8b, 0x8a, 0x8a, 0x80, 0x85, 0x89, 0x8b, 0x81,
-          0x8f, 0x8d, 0x8c, 0x8e, 0x85, 0x83, 0x87, 0x83, 0x82, 0x86, 0x8e,
-          0x8c, 0x8d, 0x8f, 0x84, 0x82, 0x86, 0x82, 0x83, 0x87,
-      };
-      if (v->jmp.variant)
-        sb(0x0f);
-      sb(variant2op[v->jmp.variant]);
-      assemblereloc(v->jmp.target, -4, 4, R_X86_64_PC32);
+    case ASM_JMP:
+      assemblejmp(&v->jmp);
       break;
-    }
     case ASM_PUSH: {
       Rex rex;
       uint8_t reg;
@@ -1054,25 +1108,9 @@ static void assemble(void) {
     case ASM_MULSS:
       assemblerrm(&v->instr, 0xf3, 0x01000f59, 1);
       break;
-    case ASM_IMUL: {
-      VarBytes prefix, opcode;
-
-      if (v->instr.variant < 8) {
-        assembledivmulneg(&v->instr, 0x05);
-      } else if (v->instr.variant < 14) {
-        opcode = 0x01000faf;
-        prefix = ((v->instr.variant - 8) % 3) == 0 ? 0x66 : -1;
-        assemblerrm(&v->instr, prefix, opcode, 1);
-      } else {
-        const Imm *imm;
-        imm = &v->instr.arg3->imm;
-        opcode = 0x69;
-        prefix = ((v->instr.variant - 14) % 3) == 0 ? 0x66 : -1;
-        assemblerrm(&v->instr, prefix, opcode, 1);
-        assemblereloc(imm->v.l, imm->v.c, imm->nbytes, R_X86_64_32);
-      }
+    case ASM_IMUL:
+      assembleimul(&v->instr);
       break;
-    }
     case ASM_NEG:
       assembledivmulneg(&v->instr, 0x03);
       break;
@@ -1085,10 +1123,9 @@ static void assemble(void) {
       assemblebasicop(&v->instr, variant2op[v->instr.variant], 0x01);
       break;
     }
-    case ASM_PXOR: {
+    case ASM_PXOR:
      assemblerrm(&v->instr, 0x66, 0x01000fef, 1);
      break;
-    }
     case ASM_SET:
       assembleset(&v->instr);
       break;
@@ -1152,6 +1189,32 @@ static void assemble(void) {
   }
 }
 
+/* Reset while remembering symbol offsets so we can size jumps. */
+static void relaxreset(void) {
+  Symbol *sym;
+  Section *sec;
+  size_t i;
+
+  /* Reset relocations and section data but retain capacity. */
+  nrelocs = 0;
+
+  for (i = 0; i < nsections; i++) {
+    sec = &sections[i];
+    if (sec == shstrtab)
+      continue;
+    sec->hdr.sh_size = 0;
+  }
+
+  /* Reset symbols, saving the worst case offset for the second pass. */
+  for (i = 0; i < symbols->cap; i++) {
+    if (!symbols->keys[i].str)
+      continue;
+    sym = symbols->vals[i];
+    *sym = (Symbol){
+        .name = sym->name, .section = sym->section, .wco = sym->offset};
+  }
+}
+
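relaxreset() is what turns the "two pass jump relaxing" roadmap item into practice: the first assemble() pass sizes every jump pessimistically, relaxreset() records each symbol's offset from that pass as its worst case offset (wco) and clears the section sizes, and the next pass can emit short jumps wherever even the worst-case distance fits in a byte. A toy model of that reasoning, with made-up numbers rather than real sections:

```
#include <stdint.h>
#include <stdio.h>

/* Toy model: one forward jump over `gap` bytes of padding to a label.
   Pass 1 assumes the 5-byte jmp rel32; pass 2 may shrink it to the
   2-byte jmp rel8 when the worst case offset already fits. */
static int64_t labeloffset(int64_t gap, int jmpbytes) {
  return jmpbytes + gap;
}

int main(void) {
  int64_t gap = 100;
  int64_t wco = labeloffset(gap, 5);              /* pass 1: worst case */
  int fits = (wco - 1) >= -128 && (wco - 1) <= 127;
  int64_t final = labeloffset(gap, fits ? 2 : 5); /* pass 2 */
  printf("wco=%lld final=%lld\n", (long long)wco, (long long)final);
  /* prints wco=105 final=102 */
  return 0;
}
```

Because relaxation only shrinks instructions, offsets never grow between passes, so a jump that fits under the worst-case layout still fits in the final one; that is why the single extra pass used by default is already safe.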
 static void addtosymtab(Symbol *sym) {
   Elf64_Sym elfsym;
   int stype;
@@ -1216,10 +1279,14 @@ static int resolvereloc(Relocation *reloc) {
     return 0;
 
   switch (reloc->type) {
-  case R_X86_64_8:
   case R_X86_64_32:
   case R_X86_64_64:
     return 0;
+  case R_X86_64_PC8:
+    rdata = &reloc->section->data[reloc->offset];
+    value = sym->offset - reloc->offset + reloc->addend;
+    rdata[0] = ((uint8_t)value & 0xff);
+    return 1;
   case R_X86_64_PC32:
     rdata = &reloc->section->data[reloc->offset];
     value = sym->offset - reloc->offset + reloc->addend;
@@ -1255,7 +1322,6 @@ static void appendreloc(Relocation *reloc) {
   case R_X86_64_PC32:
   case R_X86_64_32:
   case R_X86_64_64:
-  case R_X86_64_8:
     elfrel.r_info = ELF64_R_INFO(sym->idx, reloc->type);
     elfrel.r_offset = reloc->offset;
     elfrel.r_addend = reloc->addend;
@@ -1325,8 +1391,11 @@ static void outelf(void) {
 }
 
 static void usage(char *argv0) {
-  fprintf(stderr, "minias - a mini assembler.");
-  fprintf(stderr, "usage: %s [-o out] [input]\n", argv0);
+  fprintf(stderr, "minias - a mini x86-64 assembler.\n\n");
+  fprintf(stderr, "usage: %s [-r passes] [-o out] [input]\n", argv0);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "  -r passes  Jump relaxation iterations (default 1).\n");
+  fprintf(stderr, "  -o out     Output file to write (default stdout).\n");
   exit(2);
 }
 
@@ -1344,6 +1413,9 @@ static void parseargs(int argc, char *argv[]) {
     case 'h':
       usage(argv0);
       break;
+    case 'r':
+      nrelax = atoi(*++argv);
+      break;
     case 'o':
       if (argv[1] == NULL)
         usage(argv0);
@@ -1372,6 +1444,10 @@ int main(int argc, char *argv[]) {
   allasm = parseasm();
   initsections();
   assemble();
+  while (nrelax-- > 0) {
+    relaxreset();
+    assemble();
+  }
   fillsymtab();
   handlerelocs();
   outelf();
diff --git a/minias.h b/minias.h
@@ -24,6 +24,7 @@ typedef struct {
   const char *name;
   int32_t idx;
   int64_t offset;
+  int64_t wco; /* worst case offset */
   int64_t size;
   int global;
   int defined;
diff --git a/parse.c b/parse.c
@@ -8,11 +8,10 @@ static const Parsev *internparsev(Parsev *p) {
      for equality, even on pointer values, this works because the pointers
      themselves are also interned.
 
-     This simplicity somes with one big cost - Parsev variants with padding
-     can trigger a false positive on valgrind. It should still safe
-     because reading these undefined bytes do not change the behavior of the
-     program. The best fix is still to avoid the padding bytes in the Parsev
-     variant layout using a tool such as 'pahole'.
+     This simplicity comes with one big cost - Parsev variants with padding
+     can trigger a false positive on valgrind. It should still be safe,
+     but the best fix is still to avoid the padding bytes in the Parsev
+     variants.
   */
   size_t idx;
   const Parsev *interned;
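The rewritten comment in parse.c above is worth unpacking: interning lets parsed values be compared with a single memcmp (and interned pointers compare equal by plain pointer equality), but memcmp also reads any padding bytes the compiler inserted between struct members, which is where the valgrind false positive comes from. A minimal reproduction with a hypothetical struct (not the real Parsev):

```
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* On typical 64-bit ABIs there are 7 padding bytes after `kind`.
   memcmp() reads them even though they were never assigned, which is
   harmless but is exactly what valgrind reports as a use of
   uninitialised memory. */
typedef struct {
  uint8_t kind;
  uint64_t value;
} Variant;

int main(void) {
  Variant a, b;
  a.kind = 1; a.value = 42; /* padding stays uninitialised */
  b.kind = 1; b.value = 42;
  /* May print 0 or 1: equal members, possibly different padding bytes. */
  printf("%d\n", memcmp(&a, &b, sizeof a) == 0);
  return 0;
}
```

Zero-initialising whole values before filling them in would also silence the warning; the comment instead points at removing the padding from the Parsev variants as the eventual fix.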
diff --git a/test/test.sh b/test/test.sh
index 7ee3855..7b0e41c 100644
--- a/test/test.sh
+++ b/test/test.sh
@@ -8,7 +8,7 @@ tmpb="$(mktemp)"
 trap "rm -f \"$tmps\" \"$tmpo\" \"$tmpb\"" EXIT
 
 t () {
-  echo "$1" > "$tmps"
+  echo -e "$1" > "$tmps"
   clang -Wno-everything -c -x assembler "$tmps" -o "$tmpo"
   objcopy -j ".text" -O binary "$tmpo" "$tmpb"
   want="$(xxd -ps "$tmpb" | head -n 1 | cut -d ' ' -f 2-)"
@@ -30,6 +30,18 @@ t () {
   echo -n "."
 }
 
+# Various regression tests first.
+t "xchgq %r13, %rax"
+t "movl \$1000, %r8d"
+t "movb %sil, (%rdi)"
+t "movsbq (%rax), %rbx"
+t "movq $-4132994306676758123, %rcx"
+t "mov \$17293822569102704639, %rax"
+t "callq *%rax"
+t "callq *%r10"
+t "movb %r11b, (%rsi, %r12, 1)"
+
+
 for r in rax r10
 do
 for x in xmm0 xmm13
@@ -41,19 +53,6 @@ do
 done
 done
 
-t "movl \$1000, %r8d"
-
-t "movb %sil, (%rdi)"
-
-t "movsbq (%rax), %rbx"
-
-t "movq $-4132994306676758123, %rcx"
-t "mov \$17293822569102704639, %rax"
-
-t "callq *%rax"
-t "callq *%r10"
-
-t "movb %r11b, (%rsi, %r12, 1)"
 
 t "cvttsd2si %xmm1, %rax"
 t "cvttsd2si %xmm10, %rax"
@@ -153,6 +152,17 @@ conditioncodes="
 o p pe po s z
 "
 
+for fill in 0 1 129
+do
+  t "l:\n .fill $fill, 1, 0x00 \njmp l"
+  t "jmp l\n .fill $fill, 1, 0x00 \nl:"
+  for cc in $conditioncodes
+  do
+    t "l:\n .fill $fill, 1, 0x00 \nj${cc} l"
+    t "j${cc} l\n .fill $fill, 1, 0x00 \nl:"
+  done
+done
+
 for cc in $conditioncodes
 do
   t "set${cc} %al"
