From c2ff93e75e5f6df8e1679120b18f0d5884deab2b Mon Sep 17 00:00:00 2001
From: Roland Paterson-Jones
Date: Wed, 19 Jun 2024 16:48:11 +0200
Subject: Global Value Numbering / Global Code Motion

More or less as proposed in its ninth iteration, with the addition of
gcmmove() to restore coherent local schedules.

Changes since RFC 8:

Features:
- generalization of phi 1/0 detection
- collapse linear jmp chains before GVN; simplifies the if-graph detection
  used in 0/non-0 value inference and if-elim
- infer 0/non-0 values from dominating blk jnz; eliminates redundant
  cmp eq/ne 0 and the associated jnz/blocks, for example redundant null
  pointer checks (the hare codebase likes this)
- remove (emergent) empty if-then-else graphlets between GVN and GCM;
  improves GCM instruction placement, particularly for cmps
- merge %addr =l add %addr1, N sequences - reduces tmp count and
  register pressure
- squash consecutive associative ops with constant args, e.g.
  t1 = add t, N ... t2 = add t1, M -> t2 = add t, N+M

Bug Fixes:
- remove "cmp eq/ne of non-identical RCons" in copyref(). RCons are not
  guaranteed to be dedup'ed, and symbols can alias.

Codebase:
- moved some stuff into cfg.c, including blkmerge()
- some refactoring in gvn.c
- simplification of reassoc.c - always reassoc all cmp ops and
  Kl add %t, N. Better on coremark, smaller codebase.
- minor simplification of movins() - use vins

Testing:
- standard QBE, cproc, hare, harec, coremark [still have Rust build
  issues with latest roland]

Benchmark:
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" ~8% faster
  (crypto::sha1::sha1_1gb is the biggest obvious win - ~25% faster)

Changes since RFC 7:

Bug fixes:
- remove isbad4gcm() in GVN/GCM - it is unsound due to different state
  at GVN vs GCM time; replaced with a "reassociation" pass after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions, because GCM will not
  move them
- remove cmp eq/ne identical-arg copy detection for floating point; it
  is not valid for NaN
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
  respectively; just a typo

Minor features:
- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by a non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use in GVN; not strictly necessary, but enables
  interim global-state sanity checking
- "reassociation" of trivial constant-offset load/store addresses, and
  cmp ops, to point-of-use in a pass after GCM
- normalise commutative op arg order - e.g. op con, tmp -> op tmp, con -
  to simplify copy detection and GVN instruction dedup

Codebase:
- split core copy detection and constant folding back out into copy.c
  and fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
  reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactoring

Changes since RFC 6:
- more ext elimination in GVN by examination of def and use bit width
- elimination of redundant and-masks by bit-width examination
- incorporation of Song's patch

Changes since RFC 5:
- avoidance of "bad" candidates for GVN/GCM - trivial address offset
  calculations, and comparisons
- more copy detection, mostly around boolean values
- allow elimination of unused load, alloc and trapping instructions
- detection of trivial boolean v ?
1 : 0 phi patterns - bug fix for (removal of) "chg" optimisation in ins recreation - it was missing removal of unused instructions in some cases ifelim() between GVN and GCM; deeper nopunused() --- .gitignore | 1 + Makefile | 3 +- all.h | 77 ++++- cfg.c | 316 +++++++++++++++++++-- copy.c | 475 +++++++++++++++++++------------ fold.c | 347 ++--------------------- gcm.c | 507 +++++++++++++++++++++++++++++++++ gvn.c | 748 +++++++++++++++++++++++++++++++++++++++++++++++++ ins.c | 195 +++++++++++++ main.c | 12 +- ops.h | 291 ++++++++++--------- parse.c | 14 +- reassoc.c | 101 +++++++ ssa.c | 2 +- test/_gcm1.ssa | 48 ++++ test/_gcm2.ssa | 43 +++ test/_load-elim.ssa | 17 ++ test/gvn-live-dead.ssa | 19 ++ test/non0jnz.ssa | 31 ++ util.c | 56 ++++ 20 files changed, 2612 insertions(+), 691 deletions(-) create mode 100644 gcm.c create mode 100644 gvn.c create mode 100644 ins.c create mode 100644 reassoc.c create mode 100644 test/_gcm1.ssa create mode 100644 test/_gcm2.ssa create mode 100644 test/_load-elim.ssa create mode 100644 test/gvn-live-dead.ssa create mode 100644 test/non0jnz.ssa diff --git a/.gitignore b/.gitignore index afd08d7..09e4118 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ qbe config.h .comfile *.out +*~ diff --git a/Makefile b/Makefile index 2482eb1..85783be 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,8 @@ PREFIX = /usr/local BINDIR = $(PREFIX)/bin COMMOBJ = main.o util.o parse.o abi.o cfg.o mem.o ssa.o alias.o load.o \ - copy.o fold.o simpl.o live.o spill.o rega.o emit.o + copy.o fold.o gvn.o gcm.o ins.o simpl.o live.o \ + spill.o rega.o emit.o AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o ARM64OBJ = arm64/targ.o arm64/abi.o arm64/isel.o arm64/emit.o RV64OBJ = rv64/targ.o rv64/abi.o rv64/isel.o rv64/emit.o diff --git a/all.h b/all.h index a5a0163..ceb7520 100644 --- a/all.h +++ b/all.h @@ -191,6 +191,7 @@ enum { #define INRANGE(x, l, u) ((unsigned)(x) - l <= u - l) /* linear in x */ #define isstore(o) INRANGE(o, Ostoreb, Ostored) #define isload(o) INRANGE(o, Oloadsb, Oload) +#define isalloc(o) INRANGE(o, Oalloc4, Oalloc16) #define isext(o) INRANGE(o, Oextsb, Oextuw) #define ispar(o) INRANGE(o, Opar, Opare) #define isarg(o) INRANGE(o, Oarg, Oargv) @@ -214,8 +215,15 @@ struct Op { char *name; short argcls[2][4]; uint canfold:1; - uint hasid:1; - uint idval:1; /* identity value 0/1 */ + uint hasid:1; /* op identity value? */ + uint idval:1; /* identity value 0/1 */ + uint commutes:1; /* commutative op? */ + uint assoc:1; /* associative op? */ + uint idemp:1; /* idempotent op? */ + uint cmpeqwl:1; /* Kl/Kw cmp eq/ne? */ + uint cmplgtewl:1; /* Kl/Kw cmp lt/gt/le/ge? */ + uint eqval:1; /* 1 for eq; 0 for ne */ + uint ispinned:1; /* GCM pinned op? 
*/ }; struct Ins { @@ -230,7 +238,8 @@ struct Phi { Ref *arg; Blk **blk; uint narg; - int cls; + short cls; + uint visit:1; Phi *link; }; @@ -251,6 +260,7 @@ struct Blk { Blk *idom; Blk *dom, *dlink; + int domdpth; Blk **fron; uint nfron; @@ -344,6 +354,9 @@ struct Tmp { Wuw } width; int visit; + uint gcmbid; + uint gcminsn; // TODO get rid + uint gcmdefinsn; // TODO get rid }; struct Con { @@ -450,6 +463,20 @@ struct Dat { char isstr; }; +typedef struct InsLoc InsLoc; + +struct InsLoc { + uint bid; + uint insn; +}; + +typedef struct InsMov InsMov; + +struct InsMov { + InsLoc from; + InsLoc to; +}; + /* main.c */ extern Target T; extern char debug['Z'+1]; @@ -476,6 +503,7 @@ char *str(uint32_t); int argcls(Ins *, int); int isreg(Ref); int iscmp(int, int *, int *); +int invcmpwl(int); void emit(int, int, Ref, Ref, Ref); void emiti(Ins); void idup(Blk *, Ins *, ulong); @@ -484,16 +512,19 @@ int cmpop(int); int cmpneg(int); int clsmerge(short *, short); int phicls(int, Tmp *); +uint phiargn(Phi *, Blk *); +Ref phiarg(Phi *, Blk *); Ref newtmp(char *, int, Fn *); void chuse(Ref, int, Fn *); int symeq(Sym, Sym); Ref newcon(Con *, Fn *); Ref getcon(int64_t, Fn *); int addcon(Con *, Con *, int); +int isconbits(Fn *fn, Ref r, int64_t *v); +int istmpconbits(Fn *fn, Ins *i, int64_t *v); void salloc(Ref, Ref, Fn *); void dumpts(BSet *, Tmp *, FILE *); void runmatch(uchar *, Num *, Ref, Ref *); - void bsinit(BSet *, uint); void bszero(BSet *); uint bscount(BSet *); @@ -525,7 +556,6 @@ void elimsb(Fn *); /* cfg.c */ Blk *newblk(void); -void edgedel(Blk *, Blk **); void fillpreds(Fn *); void fillcfg(Fn *); void filldom(Fn *); @@ -533,8 +563,20 @@ int sdom(Blk *, Blk *); int dom(Blk *, Blk *); void fillfron(Fn *); void loopiter(Fn *, void (*)(Blk *, Blk *)); +void filldomdpth(Fn *); +Blk *lca(Blk *, Blk *); void fillloop(Fn *); void simpljmp(Fn *); +void replacepreds(Blk *, Blk *, Blk *); +void killblks(Fn *); +void blkmerge(Fn *); +int ifgraph(Blk *, Blk **, Blk **, Blk **); +int ifjoin(Blk *, Blk **, Blk **, Blk **); +int emptyblk(Blk *); +void ifelim(Fn *); +void clrbvisit(Fn *); +int reaches(Fn *,Blk *, Blk *); +int reachesnotvia(Fn *, Blk *, Blk *, Blk *); /* mem.c */ void promote(Fn *); @@ -552,15 +594,36 @@ int storesz(Ins *); void loadopt(Fn *); /* ssa.c */ +void adduse(Tmp *, int, Blk *, ...); void filluse(Fn *); void ssa(Fn *); void ssacheck(Fn *); /* copy.c */ -void copy(Fn *); +int iswu1(Fn *, Ref); +void narrowpars(Fn *); +Ref copyref(Fn *, Blk *, Ins *); /* fold.c */ -void fold(Fn *); +Ref foldref(Fn *, Ins *); + +/* gvn.c */ +extern Ref con01[2]; +int is0non0(Fn *, Blk *, Ref, int, int *); +void gvn(Fn *); + +/* gcm.c */ +int isfixed(Fn *, Ins *); +void gcm(Fn *); + +/* ins.c */ +void addins(Ins **, uint *, Ins *); +void addbins(Blk *, Ins **, uint *); +void nopunused(Fn *); +void movins(Fn *, InsMov *, uint, int); + +/* reassoc.c */ +void reassoc(Fn *); /* simpl.c */ void simpl(Fn *); diff --git a/cfg.c b/cfg.c index 99b6f40..1fe584c 100644 --- a/cfg.c +++ b/cfg.c @@ -39,37 +39,6 @@ prunephis(Fn *f) } } -void -edgedel(Blk *bs, Blk **pbd) -{ - Blk *bd; - Phi *p; - uint a; - int mult; - - bd = *pbd; - mult = 1 + (bs->s1 == bs->s2); - *pbd = 0; - if (!bd || mult > 1) - return; - for (p=bd->phi; p; p=p->link) { - for (a=0; p->blk[a]!=bs; a++) - assert(a+1narg); - p->narg--; - memmove(&p->blk[a], &p->blk[a+1], - sizeof p->blk[0] * (p->narg-a)); - memmove(&p->arg[a], &p->arg[a+1], - sizeof p->arg[0] * (p->narg-a)); - } - if (bd->npred != 0) { - for (a=0; bd->pred[a]!=bs; a++) - 
assert(a+1npred); - bd->npred--; - memmove(&bd->pred[a], &bd->pred[a+1], - sizeof bd->pred[0] * (bd->npred-a)); - } -} - static void addpred(Blk *bp, Blk *bc) { @@ -298,6 +267,49 @@ multloop(Blk *hd, Blk *b) b->loop *= 10; } +void +filldomdpth(Fn *fn) +{ + Blk *b, *dom; + uint dpth; + + for (b=fn->start; b; b=b->link) + b->domdpth = -1; + + fn->start->domdpth = 0; + + for (b=fn->start; b; b=b->link) { + if (b->domdpth != -1) + continue; + dpth = 1; + for (dom = b->idom; dom->domdpth == -1; dom = dom->idom) + dpth++; + dpth += dom->domdpth; + b->domdpth = dpth; + for (dom = b->idom; dom->domdpth == -1; dom = dom->idom) + dom->domdpth = --dpth; + } +} + +/* least common ancestor in dom tree */ +Blk * +lca(Blk *b1, Blk *b2) +{ + if (!b1) + return b2; + if (!b2) + return b1; + while (b1->domdpth > b2->domdpth) + b1 = b1->idom; + while (b2->domdpth > b1->domdpth) + b2 = b2->idom; + while (b1 != b2) { + b1 = b1->idom; + b2 = b2->idom; + } + return b1; +} + void fillloop(Fn *fn) { @@ -358,3 +370,245 @@ simpljmp(Fn *fn) *p = ret; free(uf); } + +static void +replacepred(Blk **blks, uint nblk, Blk *to, Blk *from) +{ + uint n; + for(n=0; npred[] and p->blk[] entries */ +void +replacepreds(Blk *s, Blk *to, Blk *from) +{ + Phi *p; + + if (!s) + return; + assert(s->npred); + replacepred(s->pred, s->npred, to, from); + for (p = s->phi; p; p = p->link) { + assert(p->narg == s->npred); + replacepred(p->blk, p->narg, to, from); + } +} + +/* remove marked-dead blks - marked as fn->rpo[id] == 0 */ +void +killblks(Fn *fn) +{ + Blk **pb; + + for (pb = &fn->start; *pb;) + if (fn->rpo[(*pb)->id]) + pb = &(*pb)->link; + else + *pb = (*pb)->link; +} + +/* merge linear jmp chains */ +/* requires rpo pred, breaks cfg use */ +void +blkmerge(Fn *fn) +{ + uint bid, nins; + Blk *curb, *b; + Ins *vins; + + if (debug['B']) + fputs("\n> Block merge:\n", stderr); + + vins = vnew(0, sizeof vins[0], PFn); + curb = 0; + /* linear jmp chains will be consecutive in rpo */ + for (bid=0; bidnblk; bid++) { + b = fn->rpo[bid]; + if (curb == 0) { + curb = b; + nins = 0; + } else + fn->rpo[bid] = 0; + addbins(b, &vins, &nins); + /* note - there are cases where GVN does not eliminate singleton phis */ + if (b->jmp.type != Jjmp || b->s1->npred != 1 || b->s1->phi) { + idup(curb, vins, nins); + curb->nins = nins; + curb->jmp = b->jmp; + replacepreds(b->s1, curb, b); + if (b->s1 != b->s2) + replacepreds(b->s2, curb, b); + curb->s1 = b->s1; + curb->s2 = b->s2; + curb = 0; + } else { + assert(b->s1->id == bid+1); + if (debug['B']) + fprintf(stderr, " merging blocks @%s -> @%s\n", b->name, b->s1->name); + } + } + assert(curb == 0); + killblks(fn); +} + +int +lonesucc(Blk *b, Blk *s) +{ + assert(s); + if (s != b) + if (s->npred == 1) + if (s->pred[0] == b) + if (s->phi == 0) + return 1; + return 0; +} + +int +lonejmpsucc(Blk *b, Blk *s) +{ + return s->jmp.type == Jjmp && lonesucc(b, s); +} + +/* (otherwise) isolated if-then[-else] graphlet */ +int +ifgraph(Blk *ifb, Blk **pthenb, Blk **pelseb, Blk **pjoinb) +{ + if (ifb->jmp.type != Jjnz) + return 0; + assert(ifb->s1 && ifb->s2); + assert(ifb->s1 != ifb->s2); /* dubious */ + *pthenb = ifb->s1; + *pelseb = ifb->s2; + *pjoinb = ifb->s1->s1; + if (ifb->s1 == ifb->s2->s1) { + /* empty then */ + *pthenb = ifb; + *pjoinb = ifb->s1; + } + if (ifb->s1->s1 == ifb->s2) + /* empty else */ + *pelseb = ifb; + + if (*pthenb == ifb || + ((*pthenb)->s1 == *pjoinb && lonejmpsucc(ifb, *pthenb))) + if (*pelseb == ifb || + ((*pelseb)->s1 == *pjoinb && lonejmpsucc(ifb, *pelseb))) + /* there are cases where npred == 
2 is not strictly needed - ifconvert for example */ + if ((*pjoinb)->npred == 2) + return 1; + + return 0; +} + +/* join blk of if-then[-else] graphlet */ +int +ifjoin(Blk *joinb, Blk **pifb, Blk **pthenb, Blk **pelseb) +{ + Blk *joinb1; + + if (joinb->npred) + if (ifgraph(joinb->pred[0], pthenb, pelseb, &joinb1)) + if (joinb == joinb1) { + *pifb = joinb->pred[0]; + return 1; + } + if (joinb->npred && joinb->pred[0]->npred) + if (ifgraph(joinb->pred[0]->pred[0], pthenb, pelseb, &joinb1)) + if (joinb == joinb1) { + *pifb = joinb->pred[0]->pred[0]; + return 1; + } + return 0; +} + +int +emptyblk(Blk *b) +{ + Ins *i; + + for (i = b->ins; i < &b->ins[b->nins]; i++) + if (i->op != Onop) + if (i->op != Odbgloc) + return 0; + return 1; +} + +/* remove empty jnz graphlets */ +/* needs rpo; breaks cfg */ +void +ifelim(Fn *fn) +{ + uint bid; + Blk *ifb, *thenb, *elseb, *joinb; + + for (bid = 0; bid < fn->nblk; bid++) { + ifb = fn->rpo[bid]; + if (ifb == 0) + continue; + if (ifgraph(ifb, &thenb, &elseb, &joinb)) + if (joinb->phi == 0) + if (thenb == ifb || emptyblk(thenb)) + if (elseb == ifb || emptyblk(elseb)) { + if (debug['B']) + fprintf(stderr, " eliminating empty if @%s -> @%s, @%s -> @%s\n", + ifb->name, thenb->name, elseb->name, joinb->name); + if (thenb != ifb) + fn->rpo[thenb->id] = 0; + if (elseb != ifb) + fn->rpo[elseb->id] = 0; + ifb->jmp.type = Jjmp; + ifb->jmp.arg = R; + ifb->s1 = joinb; + ifb->s2 = 0; + } + } + killblks(fn); +} + +void +clrbvisit(Fn *fn) +{ + Blk *b; + for (b = fn->start; b; b = b->link) + b->visit = 0; +} + +static int +reaches1(Blk *b1, Blk *b2) +{ + assert(b2); + if (b1 == b2) + return 1; + if (b1 == 0 || b1->visit) + return 0; + b1->visit = 1; + return reaches1(b1->s1, b2) || reaches1(b1->s2, b2); +} + +/* path from b1 to b2 not thru bnotvia? */ +/* uses b->visit */ +int +reachesnotvia(Fn *fn, Blk *b1, Blk *b2, Blk *bnotvia) +{ + int rc; + + if (bnotvia) + bnotvia->visit = 1; + rc = reaches1(b1, b2); + clrbvisit(fn); + return rc; +} + +/* path from b1 to b2? */ +/* uses b->visit */ +int +reaches(Fn *fn, Blk *b1, Blk *b2) +{ + return reachesnotvia(fn, b1, b2, 0); +} + diff --git a/copy.c b/copy.c index da3987f..95c7e00 100644 --- a/copy.c +++ b/copy.c @@ -1,217 +1,332 @@ #include "all.h" -static int -iscon(Ref r, int64_t bits, Fn *fn) +static uint +u64_wbits(uint64_t v) { - return rtype(r) == RCon - && fn->con[r.val].type == CBits - && fn->con[r.val].bits.i == bits; + uint n; + + n = 0; + if (v >> 32) { n += 32; v >>= 32; } + if (v >> 16) { n += 16; v >>= 16; } + if (v >> 8) { n += 8; v >>= 8; } + if (v >> 4) { n += 4; v >>= 4; } + if (v >> 2) { n += 2; v >>= 2; } + if (v >> 1) { n += 1; v >>= 1; } + return n+v; } static int -iscopy(Ins *i, Ref r, Fn *fn) +EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 }; + +static uint +EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 }; + +static uint +EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 }; + +static uint +STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 }; + +/* is the ref used only as a narrow value? 
*/ +static int +usewidthle(Fn *fn, Ref r, uint wbits) { - static bits extcpy[] = { - [WFull] = 0, - [Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw), - [Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw), - [Wsh] = BIT(Wsh) | BIT(Wsw), - [Wuh] = BIT(Wuh) | BIT(Wuw), - [Wsw] = BIT(Wsw), - [Wuw] = BIT(Wuw), - }; - Op *op; - bits b; Tmp *t; + Use *u; + Phi *p; + int b; + Ins *i; + Ref rc; + int64_t v; - if (i->op == Ocopy) + if (isconbits(fn, r, &v)) + if (u64_wbits(v) <= wbits) return 1; - op = &optab[i->op]; - if (op->hasid && KBASE(i->cls) == 0) - return iscon(i->arg[1], op->idval, fn); - if (!isext(i->op) || rtype(r) != RTmp) + if (rtype(r) != RTmp) return 0; - if (i->op == Oextsw || i->op == Oextuw) - if (i->cls == Kw) - return 1; - t = &fn->tmp[r.val]; - assert(KBASE(t->cls) == 0); - if (i->cls == Kl && t->cls == Kw) + for (u = t->use; u < &t->use[t->nuse]; u++) { + switch (u->type) { + case UPhi: + p = u->u.phi; + if (p->visit) + continue; + p->visit = 1; + b = usewidthle(fn, p->to, wbits); + p->visit = 0; + if (b) + continue; + break; + case UIns: + i = u->u.ins; + assert(i != 0); + if (i->op == Ocopy) + if (usewidthle(fn, i->to, wbits)) + continue; + if (isext(i->op)) { + if (EXTW[i->op - Oextsb] <= wbits) + continue; + else + if (usewidthle(fn, i->to, wbits)) + continue;; + } + if (i->op == Oand) { + if (req(r, i->arg[0])) + rc = i->arg[1]; + else { + assert(req(r, i->arg[1])); + rc = i->arg[0]; + } + if (isconbits(fn, rc, &v)) + if (u64_wbits(v) <= wbits) + continue; + break; + } + if (isstore(i->op)) + if (req(r, i->arg[1])) + if (STW[i->op - Ostoreb] > wbits) + continue; + break; + default: + break; + } return 0; - b = extcpy[t->width]; - return (BIT(Wsb + (i->op-Oextsb)) & b) != 0; + } + return 1; } -static Ref -copyof(Ref r, Ref *cpy) +static Phi* +findphi(Fn *fn, uint bid, Ref to) { - if (rtype(r) == RTmp && !req(cpy[r.val], R)) - return cpy[r.val]; - return r; + Phi *p; + for (p = fn->rpo[bid]->phi; p; p = p->link) + if (req(p->to, to)) + break; + assert(p); + return p; } -/* detects a cluster of phis/copies redundant with 'r'; - * the algorithm is inspired by Section 3.2 of "Simple - * and Efficient SSA Construction" by Braun M. et al. - */ -static void -phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn) +static uint +uint_min(uint v1, uint v2) { - Use **stk, *u, *u1; - uint nstk, a; - int t; - Ref r1; - Phi *p0; - - bszero(ts); - bszero(as); - p0 = &(Phi){.narg = 0}; - stk = *pstk; - nstk = 1; - stk[0] = &(Use){.type = UPhi, .u.phi = p}; - while (nstk) { - u = stk[--nstk]; - if (u->type == UIns && iscopy(u->u.ins, r, fn)) { - p = p0; - t = u->u.ins->to.val; - } - else if (u->type == UPhi) { - p = u->u.phi; - t = p->to.val; - } - else - continue; - if (bshas(ts, t)) - continue; - bsset(ts, t); - for (a=0; anarg; a++) { - r1 = copyof(p->arg[a], cpy); - if (req(r1, r)) - continue; - if (rtype(r1) != RTmp) - return; - bsset(as, r1.val); + return v1 < v2 ? v1 : v2; +} + +/* is the ref def a narrow value? 
*/ +static int +defwidthle(Fn *fn, Ref r, uint wbits) +{ + Tmp *t; + Phi *p; + Ins *i; + uint n; + int64_t v; + int x; + + if (isconbits(fn, r, &v)) + if (u64_wbits(v) <= wbits) + return 1; + if (rtype(r) != RTmp) + return 0; + t = &fn->tmp[r.val]; + if (t->cls != Kw) + return 0; + i = t->def; + if (i == 0) { + /* phi def */ + p = findphi(fn, t->bid, r); + if (p->visit) + return 1; + p->visit = 1; + for (n = 0; n < p->narg; n++) + if (!defwidthle(fn, p->arg[n], wbits)) { + p->visit = 0; + return 0; + } + p->visit = 0; + return 1; + } + /* ins def */ + if (i->op == Ocopy) + return defwidthle(fn, i->arg[0], wbits); + if (i->op == Oshr || i->op == Osar) { + if (isconbits(fn, i->arg[1], &v)) + if (0 < v && v <= 32) { + if (i->op == Oshr && 32-v <= wbits) + return 1; + if (0 <= v && v < 32 && wbits < 32) + return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v)); } - u = fn->tmp[t].use; - u1 = &u[fn->tmp[t].nuse]; - vgrow(pstk, nstk+(u1-u)); - stk = *pstk; - for (; uarg[0], wbits); + } + if (iscmp(i->op, &x, &x)) + return wbits >= 1; + if (i->op == Oand) + return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits); + if (i->op == Oor || i->op == Oxor) + return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits); + if (isext(i->op)) { + if (EXTSIGNED[i->op - Oextsb]) + return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb])); + if (EXTW[i->op - Oextsb] <= wbits) + return 1; + return defwidthle(fn, i->arg[0], wbits); } - bsdiff(as, ts); - if (!bscount(as)) - for (t=0; bsiter(ts, &t); t++) - cpy[t] = r; + return 0; +} + +/* is the ref a boolean - 0, 1 - value? */ +int +iswu1(Fn *fn, Ref r) +{ + return defwidthle(fn, r, 1); } -static void -subst(Ref *pr, Ref *cpy) +static int +isnarrowpar(Fn *fn, Ref r) { - assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R)); - *pr = copyof(*pr, cpy); + Tmp *t; + + if (rtype(r) != RTmp) + return 0; + t = &fn->tmp[r.val]; + if (t->bid != fn->start->id || t->def == 0) + return 0; + return ispar(t->def->op); } -/* requires use and dom, breaks use */ +/* Insert extub/extuh instructions in start for pars used only narrowly */ +/* needs use; breaks use */ void -copy(Fn *fn) +narrowpars(Fn *fn) { - BSet ts[1], as[1]; - Use **stk; - Phi *p, **pp; - Ins *i; Blk *b; - uint n, a, eq; - Ref *cpy, r, r1; - int t; - - bsinit(ts, fn->ntmp); - bsinit(as, fn->ntmp); - cpy = emalloc(fn->ntmp * sizeof cpy[0]); - stk = vnew(10, sizeof stk[0], PHeap); - - /* 1. build the copy-of map */ - for (n=0; nnblk; n++) { - b = fn->rpo[n]; - for (p=b->phi; p; p=p->link) { - assert(rtype(p->to) == RTmp); - if (!req(cpy[p->to.val], R)) - continue; - eq = 0; - r = R; - for (a=0; anarg; a++) - if (p->blk[a]->id < n) { - r1 = copyof(p->arg[a], cpy); - if (req(r, R) || req(r, UNDEF)) - r = r1; - if (req(r1, r) || req(r1, UNDEF)) - eq++; - } - assert(!req(r, R)); - if (rtype(r) == RTmp - && !dom(fn->rpo[fn->tmp[r.val].bid], b)) - cpy[p->to.val] = p->to; - else if (eq == p->narg) - cpy[p->to.val] = r; - else { - cpy[p->to.val] = p->to; - phisimpl(p, r, cpy, &stk, ts, as, fn); - } - } - for (i=b->ins; i<&b->ins[b->nins]; i++) { - assert(rtype(i->to) <= RTmp); - if (!req(cpy[i->to.val], R)) - continue; - r = copyof(i->arg[0], cpy); - if (iscopy(i, r, fn)) - cpy[i->to.val] = r; - else - cpy[i->to.val] = i->to; - } - } + int loop; + Ins *i, *ins; + uint npar, nins; + enum O extop; + Ref r; - /* 2. 
remove redundant phis/copies - * and rewrite their uses */ - for (b=fn->start; b; b=b->link) { - for (pp=&b->phi; (p=*pp);) { - r = cpy[p->to.val]; - if (!req(r, p->to)) { - *pp = p->link; - continue; - } - for (a=0; anarg; a++) - subst(&p->arg[a], cpy); - pp=&p->link; - } - for (i=b->ins; i<&b->ins[b->nins]; i++) { - r = cpy[i->to.val]; - if (!req(r, i->to)) { - *i = (Ins){.op = Onop}; - continue; - } - subst(&i->arg[0], cpy); - subst(&i->arg[1], cpy); + /* only useful for functions with loops */ + loop = 0; + for (b = fn->start; b; b = b->link) + if (b->loop > 1) { + loop = 1; + break; } - subst(&b->jmp.arg, cpy); + if (!loop) + return; + + b = fn->start; + npar = 0; + + for (i = b->ins; i < &b->ins[b->nins]; i++) { + if (!ispar(i->op)) + break; + npar++; } - if (debug['C']) { - fprintf(stderr, "\n> Copy information:"); - for (t=Tmp0; tntmp; t++) { - if (req(cpy[t], R)) { - fprintf(stderr, "\n%10s not seen!", - fn->tmp[t].name); - } - else if (!req(cpy[t], TMP(t))) { - fprintf(stderr, "\n%10s copy of ", - fn->tmp[t].name); - printref(cpy[t], fn, stderr); + if (npar == 0) + return; + + nins = b->nins + npar; + ins = vnew(nins, sizeof ins[0], PFn); //alloc(nins * sizeof ins[0]); + memcpy(ins, b->ins, npar * sizeof ins[0]); + memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]); + b->ins = ins; + b->nins = nins; + + for (i = b->ins; i < &b->ins[b->nins]; i++) { + if (!ispar(i->op)) + break; + extop = Onop; + if (i->cls == Kw) + if (usewidthle(fn, i->to, 16)) { + if (usewidthle(fn, i->to, 8)) + extop = Oextub; + else + extop = Oextuh; } + if (extop == Onop) { + *(i+npar) = (Ins) {.op = Onop}; + } else { + r = newtmp("vw", i->cls, fn); + *(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}}; + i->to = r; } - fprintf(stderr, "\n\n> After copy elimination:\n"); - printfn(fn, stderr); } - vfree(stk); - free(cpy); +} + +/* used by GVN */ +Ref +copyref(Fn *fn, Blk *b, Ins *i) +{ + static bits extcpy[] = { + [WFull] = 0, + [Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw), + [Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw), + [Wsh] = BIT(Wsh) | BIT(Wsw), + [Wuh] = BIT(Wuh) | BIT(Wuw), + [Wsw] = BIT(Wsw), + [Wuw] = BIT(Wuw), + }; + bits bext; + Tmp *t; + int64_t v; + int is0; + + if (i->op == Ocopy) + return i->arg[0]; + + /* op identity value */ + if (optab[i->op].hasid) + if (KBASE(i->cls) == 0) /* integer only - fp NaN! 
*/ + if (req(i->arg[1], con01[optab[i->op].idval])) + if (!optab[i->op].cmpeqwl || iswu1(fn, i->arg[0])) + return i->arg[0]; + + /* idempotent op with identical args */ + if (optab[i->op].idemp) + if (req(i->arg[0], i->arg[1])) + return i->arg[0]; + + /* integer cmp with identical args */ + if (optab[i->op].cmpeqwl || optab[i->op].cmplgtewl) + if (req(i->arg[0], i->arg[1])) + return con01[optab[i->op].eqval]; + + /* cmpeq/ne 0 with 0/non-0 inference from dominating jnz */ + if (optab[i->op].cmpeqwl) + if (req(i->arg[1], con01[0])) + if (is0non0(fn, b, i->arg[0], argcls(i,0), &is0)) + return con01[optab[i->op].eqval^is0^1]; + + /* redundant and mask */ + if (i->op == Oand) + if (isconbits(fn, i->arg[1], &v)) + if (((v+1) & v) == 0) /* v == 2^N-1 */ + if (defwidthle(fn, i->arg[0], u64_wbits(v))) + return i->arg[0]; + + if (!isext(i->op) || rtype(i->arg[0]) != RTmp) + return R; + if (i->op == Oextsw || i->op == Oextuw) + if (i->cls == Kw) + return i->arg[0]; + + t = &fn->tmp[i->arg[0].val]; + assert(KBASE(t->cls) == 0); + if (i->cls == Kl && t->cls == Kw) + return R; + bext = extcpy[t->width]; + if ((BIT(Wsb + (i->op-Oextsb)) & bext) != 0) + return i->arg[0]; + + if (!isnarrowpar(fn, i->arg[0])) + if (usewidthle(fn, i->to, EXTW[i->op - Oextsb])) + return i->arg[0]; + if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb])) + return i->arg[0]; + + return R; } diff --git a/fold.c b/fold.c index 3ff0bc7..87e5824 100644 --- a/fold.c +++ b/fold.c @@ -1,22 +1,6 @@ #include "all.h" -enum { - Bot = -1, /* lattice bottom */ - Top = 0, /* lattice top (matches UNDEF) */ -}; - -typedef struct Edge Edge; - -struct Edge { - int dest; - int dead; - Edge *work; -}; - -static int *val; -static Edge *flowrk, (*edge)[2]; -static Use **usewrk; -static uint nuse; +/* boring folding code */ static int iscon(Con *c, int w, uint64_t k) @@ -29,304 +13,6 @@ iscon(Con *c, int w, uint64_t k) return (uint32_t)c->bits.i == (uint32_t)k; } -static int -latval(Ref r) -{ - switch (rtype(r)) { - case RTmp: - return val[r.val]; - case RCon: - return r.val; - default: - die("unreachable"); - } -} - -static int -latmerge(int v, int m) -{ - return m == Top ? v : (v == Top || v == m) ? 
m : Bot; -} - -static void -update(int t, int m, Fn *fn) -{ - Tmp *tmp; - uint u; - - m = latmerge(val[t], m); - if (m != val[t]) { - tmp = &fn->tmp[t]; - for (u=0; unuse; u++) { - vgrow(&usewrk, ++nuse); - usewrk[nuse-1] = &tmp->use[u]; - } - val[t] = m; - } -} - -static int -deadedge(int s, int d) -{ - Edge *e; - - e = edge[s]; - if (e[0].dest == d && !e[0].dead) - return 0; - if (e[1].dest == d && !e[1].dead) - return 0; - return 1; -} - -static void -visitphi(Phi *p, int n, Fn *fn) -{ - int v; - uint a; - - v = Top; - for (a=0; anarg; a++) - if (!deadedge(p->blk[a]->id, n)) - v = latmerge(v, latval(p->arg[a])); - update(p->to.val, v, fn); -} - -static int opfold(int, int, Con *, Con *, Fn *); - -static void -visitins(Ins *i, Fn *fn) -{ - int v, l, r; - - if (rtype(i->to) != RTmp) - return; - if (optab[i->op].canfold) { - l = latval(i->arg[0]); - if (!req(i->arg[1], R)) - r = latval(i->arg[1]); - else - r = CON_Z.val; - if (l == Bot || r == Bot) - v = Bot; - else if (l == Top || r == Top) - v = Top; - else - v = opfold(i->op, i->cls, &fn->con[l], &fn->con[r], fn); - } else - v = Bot; - /* fprintf(stderr, "\nvisiting %s (%p)", optab[i->op].name, (void *)i); */ - update(i->to.val, v, fn); -} - -static void -visitjmp(Blk *b, int n, Fn *fn) -{ - int l; - - switch (b->jmp.type) { - case Jjnz: - l = latval(b->jmp.arg); - if (l == Bot) { - edge[n][1].work = flowrk; - edge[n][0].work = &edge[n][1]; - flowrk = &edge[n][0]; - } - else if (iscon(&fn->con[l], 0, 0)) { - assert(edge[n][0].dead); - edge[n][1].work = flowrk; - flowrk = &edge[n][1]; - } - else { - assert(edge[n][1].dead); - edge[n][0].work = flowrk; - flowrk = &edge[n][0]; - } - break; - case Jjmp: - edge[n][0].work = flowrk; - flowrk = &edge[n][0]; - break; - case Jhlt: - break; - default: - if (isret(b->jmp.type)) - break; - die("unreachable"); - } -} - -static void -initedge(Edge *e, Blk *s) -{ - if (s) - e->dest = s->id; - else - e->dest = -1; - e->dead = 1; - e->work = 0; -} - -static int -renref(Ref *r) -{ - int l; - - if (rtype(*r) == RTmp) - if ((l=val[r->val]) != Bot) { - *r = CON(l); - return 1; - } - return 0; -} - -/* require rpo, use, pred */ -void -fold(Fn *fn) -{ - Edge *e, start; - Use *u; - Blk *b, **pb; - Phi *p, **pp; - Ins *i; - int t, d; - uint n, a; - - val = emalloc(fn->ntmp * sizeof val[0]); - edge = emalloc(fn->nblk * sizeof edge[0]); - usewrk = vnew(0, sizeof usewrk[0], PHeap); - - for (t=0; tntmp; t++) - val[t] = Top; - for (n=0; nnblk; n++) { - b = fn->rpo[n]; - b->visit = 0; - initedge(&edge[n][0], b->s1); - initedge(&edge[n][1], b->s2); - } - initedge(&start, fn->start); - flowrk = &start; - nuse = 0; - - /* 1. 
find out constants and dead cfg edges */ - for (;;) { - e = flowrk; - if (e) { - flowrk = e->work; - e->work = 0; - if (e->dest == -1 || !e->dead) - continue; - e->dead = 0; - n = e->dest; - b = fn->rpo[n]; - for (p=b->phi; p; p=p->link) - visitphi(p, n, fn); - if (b->visit == 0) { - for (i=b->ins; i<&b->ins[b->nins]; i++) - visitins(i, fn); - visitjmp(b, n, fn); - } - b->visit++; - assert(b->jmp.type != Jjmp - || !edge[n][0].dead - || flowrk == &edge[n][0]); - } - else if (nuse) { - u = usewrk[--nuse]; - n = u->bid; - b = fn->rpo[n]; - if (b->visit == 0) - continue; - switch (u->type) { - case UPhi: - visitphi(u->u.phi, u->bid, fn); - break; - case UIns: - visitins(u->u.ins, fn); - break; - case UJmp: - visitjmp(b, n, fn); - break; - default: - die("unreachable"); - } - } - else - break; - } - - if (debug['F']) { - fprintf(stderr, "\n> SCCP findings:"); - for (t=Tmp0; tntmp; t++) { - if (val[t] == Bot) - continue; - fprintf(stderr, "\n%10s: ", fn->tmp[t].name); - if (val[t] == Top) - fprintf(stderr, "Top"); - else - printref(CON(val[t]), fn, stderr); - } - fprintf(stderr, "\n dead code: "); - } - - /* 2. trim dead code, replace constants */ - d = 0; - for (pb=&fn->start; (b=*pb);) { - if (b->visit == 0) { - d = 1; - if (debug['F']) - fprintf(stderr, "%s ", b->name); - edgedel(b, &b->s1); - edgedel(b, &b->s2); - *pb = b->link; - continue; - } - for (pp=&b->phi; (p=*pp);) - if (val[p->to.val] != Bot) - *pp = p->link; - else { - for (a=0; anarg; a++) - if (!deadedge(p->blk[a]->id, b->id)) - renref(&p->arg[a]); - pp = &p->link; - } - for (i=b->ins; i<&b->ins[b->nins]; i++) - if (renref(&i->to)) - *i = (Ins){.op = Onop}; - else { - for (n=0; n<2; n++) - renref(&i->arg[n]); - if (isstore(i->op)) - if (req(i->arg[0], UNDEF)) - *i = (Ins){.op = Onop}; - } - renref(&b->jmp.arg); - if (b->jmp.type == Jjnz && rtype(b->jmp.arg) == RCon) { - if (iscon(&fn->con[b->jmp.arg.val], 0, 0)) { - edgedel(b, &b->s1); - b->s1 = b->s2; - b->s2 = 0; - } else - edgedel(b, &b->s2); - b->jmp.type = Jjmp; - b->jmp.arg = R; - } - pb = &b->link; - } - - if (debug['F']) { - if (!d) - fprintf(stderr, "(none)"); - fprintf(stderr, "\n\n> After constant folding:\n"); - printfn(fn, stderr); - } - - free(val); - free(edge); - vfree(usewrk); -} - -/* boring folding code */ - static int foldint(Con *res, int op, int w, Con *cl, Con *cr) { @@ -516,7 +202,7 @@ foldflt(Con *res, int op, int w, Con *cl, Con *cr) } } -static int +static Ref opfold(int op, int cls, Con *cl, Con *cr, Fn *fn) { Ref r; @@ -524,12 +210,37 @@ opfold(int op, int cls, Con *cl, Con *cr, Fn *fn) if (cls == Kw || cls == Kl) { if (foldint(&c, op, cls == Kl, cl, cr)) - return Bot; + return R; } else foldflt(&c, op, cls == Kd, cl, cr); if (!KWIDE(cls)) c.bits.i &= 0xffffffff; r = newcon(&c, fn); assert(!(cls == Ks || cls == Kd) || c.flt); - return r.val; + return r; +} + +/* used by GVN */ +Ref +foldref(Fn *fn, Ins *i) +{ + Ref rr; + Con *cl, *cr; + + if (rtype(i->to) != RTmp) + return R; + if (optab[i->op].canfold) { + if (rtype(i->arg[0]) != RCon) + return R; + cl = &fn->con[i->arg[0].val]; + rr = i->arg[1]; + if (req(rr, R)) + rr = CON_Z; + if (rtype(rr) != RCon) + return R; + cr = &fn->con[rr.val]; + + return opfold(i->op, i->cls, cl, cr, fn); + } + return R; } diff --git a/gcm.c b/gcm.c new file mode 100644 index 0000000..515d41f --- /dev/null +++ b/gcm.c @@ -0,0 +1,507 @@ +#include "all.h" + +#define NOBID (-1u) + +/* ins can trap at runtime */ +static int +istrapping(Fn *fn, Ins *i) +{ + int64_t v; + + if (KBASE(i->cls) == 0) + if (INRANGE(i->op, Odiv, 
Ourem)) + if (!isconbits(fn, i->arg[1], &v) || v == 0) + return 1; + return 0; +} + +/* fixed ins that can be eliminated if unused */ +static int +canelim(Fn *fn, Ins *i) +{ + return isload(i->op) || isalloc(i->op) || istrapping(fn, i); +} + +/* ins must stay in def blk */ +int +isfixed(Fn *fn, Ins *i) +{ + return optab[i->op].ispinned || istrapping(fn, i); +} + +static uint earlyins(Fn *, Blk *, Ins *); + +/* return earlybid of ref def ins */ +static uint +schedearly(Fn *fn, Ref r) +{ + Tmp *t; + Blk *b; + + if (rtype(r) != RTmp) + return 0 /* root/start */; + + t = &fn->tmp[r.val]; + if (t->gcmbid != NOBID) + return t->gcmbid; /* already visited/visiting */ + + b = fn->rpo[t->bid]; + if (t->def) { + /* def is an ins */ + assert(b->ins <= t->def && t->def < &b->ins[b->nins]); + t->gcmbid = 0; /* mark visiting root/start blk */ + t->gcmbid = earlyins(fn, b, t->def); /* schedule ins input defs */ + } else { + /* def is a phi - it stays in its def blk */ + t->gcmbid = t->bid; + } + + return t->gcmbid; +} + +/* return deepest domdpth bid of arg defs */ +static uint +earlyins(Fn *fn, Blk *b, Ins* i) +{ + uint earlybid, arg1earlybid; + + earlybid = schedearly(fn, i->arg[0]); + assert(earlybid != NOBID); + arg1earlybid = schedearly(fn, i->arg[1]); + assert(arg1earlybid != NOBID); + if (fn->rpo[earlybid]->domdpth < fn->rpo[arg1earlybid]->domdpth) { + assert(dom(fn->rpo[earlybid], fn->rpo[arg1earlybid])); + earlybid = arg1earlybid; + } + + /* fixed ins remain in their defining blk */ + return isfixed(fn, i) ? b->id : earlybid; +} + +static void +earlyblk(Fn *fn, uint bid) +{ + Blk *b; + Phi *p; + Ins *i; + uint n; + + b = fn->rpo[bid]; + for (p = b->phi; p; p = p->link) + for (n = 0; n < p->narg; n++) + schedearly(fn, p->arg[n]); + for (i = b->ins; i < &b->ins[b->nins]; i++) + if (isfixed(fn, i)) + earlyins(fn, b, i); + schedearly(fn, b->jmp.arg); +} + +/* least common ancestor in dom tree */ +static uint +lcabid(Fn *fn, uint bid1, uint bid2) +{ + Blk *b; + + if (bid1 == NOBID) + return bid2; + if (bid2 == NOBID) + return bid1; + + b = lca(fn->rpo[bid1], fn->rpo[bid2]); + assert(b); + return b->id; +} + +static uint +bestbid(Fn *fn, uint earlybid, uint latebid) +{ + Blk *curb, *earlyb, *bestb; + + if (latebid == NOBID) + return NOBID; /* unused */ + + assert(earlybid != NOBID); + + earlyb = fn->rpo[earlybid]; + bestb = curb = fn->rpo[latebid]; + assert(dom(earlyb, curb)); + + while (curb != earlyb) { + curb = curb->idom; + if (curb->loop < bestb->loop) + bestb = curb; + } + return bestb->id; +} + +static uint lateins(Fn *, Blk *, Ins *, Ref r); +static uint latephi(Fn *, Phi *, Ref r); +static uint latejmp(Blk *, Ref r); + +/* return lca bid of ref uses */ +static uint +schedlate(Fn *fn, Ref r) +{ + Tmp *t; + Blk *b; + Use *u; + uint earlybid; + uint latebid; + uint uselatebid; + + if (rtype(r) != RTmp) + return NOBID; + + t = &fn->tmp[r.val]; + if (t->visit) + return t->gcmbid; /* already visited/visiting */ + + t->visit = 1; /* mark visiting/visited */ + earlybid = t->gcmbid; + if (earlybid == NOBID) + return NOBID; /* not used */ + t->gcmbid = t->bid; /* t->gcmbid is now latebid */ + + latebid = NOBID; + for (u = t->use; u < &t->use[t->nuse]; u++) { + assert(u->bid < fn->nblk); + b = fn->rpo[u->bid]; + switch (u->type) { + case UXXX: + die("unreachable"); + break; + case UPhi: + uselatebid = latephi(fn, u->u.phi, r); + break; + case UIns: + uselatebid = lateins(fn, b, u->u.ins, r); + break; + case UJmp: + uselatebid = latejmp(b, r); + break; + } + latebid = lcabid(fn, latebid, uselatebid); + } 
+ + /* phis stay in their def blk */ + if (t->def) { + /* allow elimination of unused load/alloc/trapping ins */ + if (latebid == NOBID && canelim(fn, t->def)) + t->gcmbid = NOBID; + /* ... otherwise fixed ins stay in defining blk */ + else if(!isfixed(fn, t->def)) + t->gcmbid = bestbid(fn, earlybid, latebid); + } + + return t->gcmbid; +} + +/* return lca bid of uses, or NOBID if no active uses */ +static uint +lateins(Fn *fn, Blk *b, Ins *i, Ref r) +{ + uint latebid; + + assert(i->op == Onop || req(i->arg[0], r) || req(i->arg[1], r)); + if (i->op == Onop) + return NOBID; /* already eliminated */ + + assert(b->ins <= i && i < &b->ins[b->nins]); + + latebid = schedlate(fn, i->to); + /* allow elimination of unused load/alloc/trapping ins */ + if (latebid == NOBID) + if (canelim(fn, i)) + return NOBID; + /* ... otherwise fixed ins stay in defining blk */ + return isfixed(fn, i) ? b->id : latebid; +} + +static uint +latephi(Fn *fn, Phi *p, Ref r) +{ + uint n; + uint latebid; + + if (!p->narg) + return NOBID; /* marked as unused */ + + latebid = NOBID; + for (n = 0; n < p->narg; n++) + if (req(p->arg[n], r)) + latebid = lcabid(fn, latebid, p->blk[n]->id); + + assert(latebid != NOBID); + return latebid; +} + +static uint +latejmp(Blk *b, Ref r) +{ + if (req(b->jmp.arg, R)) + return NOBID; + else { + assert(req(b->jmp.arg, r)); + return b->id; + } +} + +static void +lateblk(Fn *fn, uint bid) +{ + Blk *b; + Phi **pp; + Ins *i; + + b = fn->rpo[bid]; + for (pp = &b->phi; *(pp);) + if (schedlate(fn, (*pp)->to) == NOBID) { + /* unused */ + (*pp)->narg = 0; /* mark unused */ + *pp = (*pp)->link; /* remove phi */ + } else + pp = &(*pp)->link; + + for (i = b->ins; i < &b->ins[b->nins]; i++) + if (isfixed(fn, i)) + lateins(fn, b, i, i->arg[0]); +} + +static void +addgcmins(Fn *fn, Ins *vins, uint nins) +{ + Ins *i; + Tmp *t; + Blk *b; + + for (i = vins; i < &vins[nins]; i++) { + assert(rtype(i->to) == RTmp); + t = &fn->tmp[i->to.val]; + b = fn->rpo[t->gcmbid]; + addins(&b->ins, &b->nins, i); + } +} + +/* remove unused instructions */ +/* move instructions to (end of) target block */ +/* use-before-def is fixed up afterwards */ +static void +gcmmove(Fn *fn) +{ + Tmp *t; + Ins *vins, *i; + uint nins; + + nins = 0; + vins = vnew(nins, sizeof vins[0], PFn); + + for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) { + if (t->def == 0) + continue; + if (t->bid == t->gcmbid) + continue; + i = t->def; + if (isfixed(fn, i) && !canelim(fn, i)) + continue; + assert(rtype(i->to) == RTmp); + assert(t == &fn->tmp[i->to.val]); + if (t->gcmbid != NOBID) + addins(&vins, &nins, i); + *i = (Ins){.op = Onop}; + } + + addgcmins(fn, vins, nins); +} + +static void +schedins(Fn *fn, Blk *b, Ins *i0, Ins **pvins, uint *pnins) +{ + Ins *i, *i1; + Tmp *t; + uint na; + + if (i0->op == Onop) + return; + /* arg...call have to stay together */ + /* TODO - sel0...sel1 too */ + for (i1 = i0; isarg(i1->op); i1++) {} + for (i = i0; i <= i1; i++) + for (na = 0; na < 2; na++) { + if (rtype(i->arg[na]) != RTmp) + continue; + t = &fn->tmp[i->arg[na].val]; + /* def in different blk, or phi in this blk */ + if (t->bid != b->id || t->def == 0) + continue; + /* already scheduled */ + if (t->def->op == Onop) { + continue; + } + schedins(fn, b, t->def, pvins, pnins); + } + for (i = i0; i <= i1; i++) { + addins(pvins, pnins, i); + *i = (Ins){.op = Onop}; + } +} + +static void +fixbub4d(Fn *fn, Blk *b, Ins **pvins) +{ + Ins *i; + uint nins; + + nins = 0; + for (i = b->ins; i < &b->ins[b->nins]; i++) + schedins(fn, b, i, pvins, &nins); + idup(b, *pvins, 
nins); +} + +static void +fixub4d(Fn *fn) +{ + Blk *b; + Ins *vins; // TODO insb + + vins = vnew(0, sizeof vins[0], PFn); + for (b = fn->start; b; b = b->link) + fixbub4d(fn, b, &vins); +} + +static int +iskladdcon(Fn *fn, Ins *i, int64_t *v) +{ + if (i->op == Oadd) + if (i->cls == Kl) + if (rtype(i->arg[0]) == RTmp) + if (isconbits(fn, i->arg[1], v)) + return 1; + return 0; +} + +static void +ireassoc(Fn *fn, Blk *b, Ins *i, Ins **pvins, uint *pnins) +{ + Blk *b2; + Tmp *t, *t2; + Use *u; + Ref r2; + int64_t v; + int x; + + assert(b->ins <= i && i < &b->ins[b->nins]); + if (!iscmp(i->op, &x, &x)) + if (!iskladdcon(fn, i, &v)) + return; + + assert(rtype(i->to) == RTmp); + t = &fn->tmp[i->to.val]; + for (u = t->use; u < &t->use[t->nuse]; u++) { + if (u->type == UPhi) + continue; + b2 = fn->rpo[u->bid]; + addins(pvins, pnins, i); + /* careful, can move fn->tmp */ + r2 = newtmp("rea", t->cls, fn); + t = &fn->tmp[i->to.val]; + t2 = &fn->tmp[r2.val]; + t2->gcmbid = u->bid; + (*pvins)[(*pnins)-1].to = r2; + if (u->type == UIns) { + assert(req(u->u.ins->arg[0], i->to) + || req(u->u.ins->arg[1], i->to)); + if (req(u->u.ins->arg[0], i->to)) + u->u.ins->arg[0] = r2; + if (req(u->u.ins->arg[1], i->to)) + u->u.ins->arg[1] = r2; + } else { + assert(u->type == UJmp); + assert(req(b2->jmp.arg, i->to)); + b2->jmp.arg = r2; + } + } +} + +/* Redistribute trivial ops to point of use. */ +/* Reduces register pressure. */ +/* needs rpo, use; breaks use */ +void +reassoc(Fn *fn) +{ + Blk *b; + Ins *vins, *i; + uint nins; + + nins = 0; + vins = vnew(nins, sizeof vins[0], PFn); + + /* identify trivial ins */ + for (b = fn->start; b; b = b->link) { + for (i = b->ins; i < &b->ins[b->nins]; i++) + ireassoc(fn, b, i, &vins, &nins); + } + + addgcmins(fn, vins, nins); +} + +static void +cleartmps(Fn *fn) +{ + Tmp *t; + + for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) { + t->visit = 0; + t->gcmbid = NOBID; + t->gcminsn = -1u; // TODO - get rid + } +} + +/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */ +/* require use dom */ +/* maintains rpo pred dom */ +/* breaks use */ +void +gcm(Fn *fn) +{ + uint bid; + + /* fprintf(stderr, "\n\nBefore gcm:\n\n"); */ + /* printfn(fn, stderr); */ + + filldomdpth(fn); + fillloop(fn); + + cleartmps(fn); + for (bid=0; bidnblk; bid++) + earlyblk(fn, bid); + for (bid=0; bidnblk; bid++) + lateblk(fn, bid); + + /* fprintf(stderr, "\n\nBefore gcmmove/fixub4d:\n\n"); */ + /* printfn(fn, stderr); */ + + gcmmove(fn); + cleartmps(fn); + /* filluse(fn); */ + /* fixub4d(fn); */ + + /* fprintf(stderr, "\n\nAfter gcmmove/fixub4d:\n\n"); */ + /* printfn(fn, stderr); */ + + //fillcfg(fn); + filluse(fn); + reassoc(fn); + + /* cleartmps(fn); */ + filluse(fn); + fixub4d(fn); + + /* delete (now) unused ins - already done later??? 
*/ + filluse(fn); + nopunused(fn); + + if (debug['G']) { + fprintf(stderr, "\n> After GCM:\n"); + printfn(fn, stderr); + } +} diff --git a/gvn.c b/gvn.c new file mode 100644 index 0000000..20c8edd --- /dev/null +++ b/gvn.c @@ -0,0 +1,748 @@ +#include "all.h" + +/* literal constants 0, 1 */ +Ref con01[2]; + +static inline uint +mix(uint x0, uint x1) +{ + return x0 + 17*x1; +} + +static inline uint +rhash(Ref r) +{ + return mix(r.type, r.val); +} + +static uint +ihash(Ins *i) +{ + uint a0h, a1h, ah, h; + + a0h = rhash(i->arg[0]); + a1h = rhash(i->arg[1]); + ah = mix(a0h, a1h); + h = mix(i->cls, i->op); + h = mix(h, ah); + + return h; + +} + +static int +ieq(Ins *ia, Ins *ib) +{ + if (ia->cls == ib->cls) + if (ia->op == ib->op) + if (req(ia->arg[0], ib->arg[0])) + if (req(ia->arg[1], ib->arg[1])) + return 1; + return 0; +} + +static Ins** gvntbl; +static uint gvntbln; +static uint lookupn; +static uint proben; +static uint maxproben; + +static Ins * +gvndup(Ins *i, int insert) { + uint idx, n; + Ins *i2; + + lookupn++; + + idx = ihash(i) % gvntbln; + for (n = 1;; n++) { + proben++; + if (n > maxproben) + maxproben = n; + i2 = gvntbl[idx]; + if (!i2) + break; + if (ieq(i, i2)) + return i2; + + idx++; + if (gvntbln <= idx) + idx = 0; + } + /* not found */ + if (insert) { + gvntbl[idx] = i; + } + return 0; +} + +static void +replaceref(Ref *r, Ref r1, Ref r2) +{ + if (req(*r, r1)) + *r = r2; +} + +static void +replacepuse(Phi *p, Ref r1, Ref r2) +{ + Ref *a; + + for (a = p->arg; a < &p->arg[p->narg]; a++) + replaceref(a, r1, r2); +} + +static void +replaceiuse(Ins *i, Ref r1, Ref r2) +{ + replaceref(&i->arg[0], r1, r2); + replaceref(&i->arg[1], r1, r2); +} + +static void +replacejuse(Blk* b, Ref r1, Ref r2) +{ + replaceref(&b->jmp.arg, r1, r2); +} + +static void +replaceuse(Fn *fn, Use* u, Ref r1, Ref r2) +{ + Tmp *t2; + Blk *b; + + t2 = rtype(r2) == RTmp ? 
&fn->tmp[r2.val] : 0; + b = fn->rpo[u->bid]; + + switch (u->type) { + case UXXX: + die("unreachable"); + break; + case UPhi: + replacepuse(u->u.phi, r1, r2); + if (t2) + adduse(t2, UPhi, b, u->u.phi); + break; + case UIns: + replaceiuse(u->u.ins, r1, r2); + if (t2) + adduse(t2, UIns, b, u->u.ins); + break; + case UJmp: + replacejuse(fn->rpo[u->bid], r1, r2); + if (t2) + adduse(t2, UJmp, b); + break; + } +} + +static void +replaceuses(Fn *fn, Ref r1, Ref r2) +{ + Tmp *t1; + Use *u; + + assert(rtype(r1) == RTmp); + t1 = &fn->tmp[r1.val]; + + for (u = t1->use; u < &t1->use[t1->nuse]; u++) + replaceuse(fn, u, r1, r2); + + t1->nuse = 0; +} + +static void +rmuse(Fn *fn, Blk *b, uint ty, Ins *i, Phi *p, Ref r, int strict) +{ + Tmp *t; + Use *u; + int found, rm; + + if (rtype(r) != RTmp) + return; + found = 0; + t = &fn->tmp[r.val]; + for (u = t->use; u < &t->use[t->nuse];) { + rm = 0; + if (u->type == ty) { + switch (ty) { + case UXXX: + die("unreachable"); + break; + case UIns: + assert(p == 0); + assert(i); + rm = u->u.ins == i; + break; + case UPhi: + assert(i == 0); + assert(p); + rm = u->u.phi == p; + break; + case UJmp: + assert(i == 0); + assert(p == 0); + rm = u->bid == b->id; + break; + default: + die("unreachable"); + break; + } + } + if (rm) { + found++; + assert(u < &t->use[t->nuse]); + assert(u->bid == b->id); + /* careful - deleting below iterator */ + memcpy(u, u+1, ((t->nuse) - ((u+1)-t->use)) * sizeof u[0]); + t->nuse--; + } else + u++; + } + if (strict) + assert(found); +} + +static int +phieq(Phi *pa, Phi *pb) +{ + uint n; + + assert(pa->narg == pb->narg); + for (n=0; nnarg; n++) { + if (!req(pa->arg[n], phiarg(pb, pa->blk[n]))) + return 0; + } + return 1; +} + +static void +killphi(Fn *fn, Blk *b, Phi **pp, Ref r) +{ + Phi *p; + uint n; + + p = *pp; + replaceuses(fn, p->to, r); + assert(b->npred == p->narg); + for (n = 0; n < p->narg; n++) + rmuse(fn, b, UPhi, 0, p, p->arg[n], 0/*!strict TODO dups*/); + p->narg = 0; /* mark as unused - TODO should not be necessary with rmuse*/ + *pp = p->link; +} + +static void +ifphiopt(Fn *fn, Blk *b) +{ + Blk *ifb, *thenb, *elseb, *tmp; + Phi *p, **pp; + Tmp *t; + Ref argt, argf, r1; + int k, x, caninv; + + /* is b the join blk of an if[-then][-else] graphlet? 
*/ + if (!ifjoin(b, &ifb, &thenb, &elseb)) + return; + + assert(ifb->jmp.type == Jjnz); + assert(ifb->s1 == b || (ifb->s1 == thenb && thenb->s1 == b)); + assert(ifb->s2 == b || (ifb->s2 == elseb && elseb->s1 == b)); + if (!iswu1(fn, ifb->jmp.arg)) + return; /* no opportunity */ + + caninv = 1; + for (p = b->phi; p; p = p->link) + if (req(phiarg(p, elseb), con01[0])) { + caninv = 0; + break; + } + + /* phi bool 1/0 "copy" */ + for (pp = &b->phi; *pp;) { + p = *pp; + assert(p->narg == 2); + argt = phiarg(p, thenb); + argf = phiarg(p, elseb); + /* jnz jmp.arg in phi is 1/0 */ + if (req(argt, ifb->jmp.arg)) { + if (!req(argf, ifb->jmp.arg)) + rmuse(fn, b, UPhi, 0, p, argt, 1/*strict*/); + argt = p->arg[phiargn(p, thenb)] = con01[1]; + } + if (req(argf, ifb->jmp.arg)) { + rmuse(fn, b, UPhi, 0, p, argf, 1/*strict*/); + argf = p->arg[phiargn(p, elseb)] = con01[0]; + } + /* prefer 0 as argf and/or 1 as argt, so try to invert the cmp */ + if (caninv && + ((req(argt, con01[0]) && !req(argf, con01[0])) || + (req(argf, con01[1]) && !req(argt, con01[1])))) { + assert(rtype(ifb->jmp.arg) == RTmp); + t = &fn->tmp[ifb->jmp.arg.val]; + if (t->nuse == 1) + if (t->def && iscmp(t->def->op, &k, &x) && KBASE(k) == 0) { + assert(t->use[0].type == UJmp); + assert(t->bid == ifb->id); + t->def->op = invcmpwl(t->def->op); + tmp = ifb->s1; + ifb->s1 = ifb->s2; + ifb->s2 = tmp; + r1 = argt; argt = argf; argf = r1; + caninv = 0; /* only once */ + } + } + if (req(argt, con01[1]) && req(argf, con01[0])) { + killphi(fn, b, pp, ifb->jmp.arg); + caninv = 0; /* used already */ + continue; + } + pp = &(*pp)->link; + } +} + +/* phi "copy" - singleton or all args identical */ +static void +copyphielim(Fn *fn, Blk *b) { + Phi *p, **pp; + uint n; + + for (pp = &b->phi; *pp;) { + p = *pp; + assert(p->narg == b->npred); + for (n = 0; n < p->narg-1; n++) { + if (!req(p->arg[n], p->arg[n+1])) + goto Skip; + } + killphi(fn, b, pp, p->arg[0]); + continue; + Skip:; + pp = &(*pp)->link; + } +} + +/* redundant phis */ +static void +dupphielim(Fn *fn, Blk *b) { + Phi *p, *p2, **pp; + + /* O(N^2.M) but meh */ + for (pp = &b->phi; *pp;) { + p = *pp; + assert(p->narg == b->npred); + for (p2 = p->link; p2; p2 = p2->link) { + assert(p2->narg == b->npred); + if (phieq(p, p2)) { + killphi(fn, b, pp, p2->to); + goto Skip; + } + } + pp = &(*pp)->link; + Skip:; + } + return; +} + +static void +dedupphis(Fn *fn, Blk *b) { + ifphiopt(fn, b); + copyphielim(fn,b); + dupphielim(fn, b); +} + +static int +rcmp(Ref a, Ref b) +{ + if (rtype(a) != rtype(b)) + return rtype(a)-rtype(b); + return a.val - b.val; +} + +static void +normins(Fn *fn, Ins *i) +{ + uint n; + int64_t v; + Ref r; + + /* truncate constant bits to 32 bits for "s"/w" uses */ + for (n = 0; n < 2; n++) { + if (!KWIDE(argcls(i, n))) + if (isconbits(fn, i->arg[n], &v)) + if ((v & 0xffffffff) != v) + i->arg[n] = getcon(v & 0xffffffff, fn); + } + /* order arg[0] <= arg[1] for commutative ops, preferring RTmp in arg[0] */ + if (optab[i->op].commutes) + if (rcmp(i->arg[0], i->arg[1]) > 0) { + r = i->arg[1]; + i->arg[1] = i->arg[0]; + i->arg[0] = r; + } +} + +static Ref +negcon(Fn *fn, int cls, Ref r) +{ + int64_t v, v1; + + assert(isconbits(fn, r, &v)); + assert(KBASE(cls) == 0); + v1 = -v; + if (cls == Kw) + v1 = ((int64_t)(-(int32_t)v)) & 0xffffffff; + return getcon(v1, fn); +} + +static void +assoccon(Fn *fn, Blk *b, Ins *i1) +{ + Tmp *t2; + Ins *i2, i; + Ref r0, r1, rc; + int op1, op2; + int64_t v1, v2, vc; + + op1 = i1->op; + if (op1 == Osub) + op1 = Oadd; + if (!optab[op1].assoc || 
KBASE(i1->cls) != 0 || rtype(i1->arg[0]) != RTmp + || !isconbits(fn, i1->arg[1], &v1) || (optab[op1].hasid && v1 == optab[op1].idval)) + return; + t2 = &fn->tmp[i1->arg[0].val]; + if (t2->def == 0) + return; + i2 = t2->def; + op2 = i2->op; + if (op2 == Osub) + op2 = Oadd; + if (op1 != op2 || rtype(i2->arg[0]) != RTmp || !isconbits(fn, i2->arg[1], &v2)) + return; + assert(KBASE(i2->cls) == 0); + assert(i1->cls == Kl || i2->cls == Kw); + r0 = i1->arg[1]; + if (i1->op == Osub) + r0 = negcon(fn, i1->cls, r0); + r1 = i2->arg[1]; + if (i2->op == Osub) + r1 = negcon(fn, i2->cls, r1); + i = (Ins){.to = i2->to, .op = op2, .cls = i2->cls, .arg = {r0, r1}}; + rc = foldref(fn, &i); + assert(isconbits(fn, rc, &vc)); + if (op1 == Oadd) { + if (i2->cls == Kw) { + if ((int32_t)vc < 0) { + op1 = Osub; + rc = negcon(fn, Kw, rc); + } + } else { + assert(i2->cls == Kl); + if (vc < 0) { + op1 = Osub; + rc = negcon(fn, Kl, rc); + } + } + } + i1->op = op1; + rmuse(fn, b, UIns, i1, 0, i1->arg[0], 1/*strict*/); + i1->arg[0] = i2->arg[0]; + adduse(&fn->tmp[i1->arg[0].val], UIns, b, i1); + i1->arg[1] = rc; +} + +static void +killins(Fn *fn, Blk *b, Ins *i, Ref r1, Ref r2) +{ + replaceuses(fn, r1, r2); + rmuse(fn, b, UIns, i, 0, i->arg[0], 1/*strict*/); + if (!req(i->arg[0], i->arg[1])) + rmuse(fn, b, UIns, i, 0, i->arg[1], 1/*strict*/); + *i = (Ins){.op = Onop}; +} + +static void +dedupi(Fn *fn, Blk *b, Ins *i) +{ + Ref r2; + Ins *i2; + + if (i->op == Onop || i->op == Odbgloc) + return; + + normins(fn, i); + + if (optab[i->op].ispinned) + return; + assert(rtype(i->to) == RTmp); + + /* merge associative ops with constant arg[1] */ + assoccon(fn, b, i); + + /* effective copy? */ + r2 = copyref(fn, b, i); + if (!req(r2, R)) { + killins(fn, b, i, i->to, r2); + return; + } + + /* effective constant? */ + r2 = foldref(fn, i); + if (!req(r2, R)) { + killins(fn, b, i, i->to, r2); + return; + } + + /* do not dedup (trapping) ins that GCM will not move */ + if (isfixed(fn, i)) + return; + + /* duplicate? 
*/ + i2 = gvndup(i, 1); + if (i2) { + killins(fn, b, i, i->to, i2->to); + return; + } +} + +static void +dedupins(Fn *fn, Blk *b) +{ + Ins *i; + + for (i = b->ins; i < &b->ins[b->nins]; i++) + dedupi(fn, b, i); +} + +int +cmpeq0def(Fn *fn, Ref r, Ref *arg0, int *cls, int *eqval) +{ + Ins *i; + + if (rtype(r) != RTmp) + return 0; + i = fn->tmp[r.val].def; + if (i) + if (optab[i->op].cmpeqwl) + if (req(i->arg[1], con01[0])) { + *arg0 = i->arg[0]; + *cls = argcls(i, 0); + *eqval = optab[i->op].eqval; + return 1; + } + return 0; +} + +static int +branchdom(Fn *fn, Blk *bif, Blk *bbr1, Blk *bbr2, Blk *b) +{ + if (bif->jmp.type == Jjnz) + if (b != bif) + if (dom(bbr1, b)) + if (!reachesnotvia(fn, bbr2, b, bif)) + return 1; + return 0; +} + +static int +dom0non0(Fn *fn, Blk *bdom, Blk *b, int *is0) +{ + if (branchdom(fn, bdom, bdom->s1, bdom->s2, b)) { + *is0 = 0; + return 1; + } + if (branchdom(fn, bdom, bdom->s2, bdom->s1, b)) { + *is0 = 1; + return 1; + } + return 0; +} + +/* infer 0/non-0 value from dominating jnz */ +int +is0non0(Fn *fn, Blk *b, Ref r, int cls, int *is0) +{ + Blk *bdom; + Ref arg0; + int cls1, eqval; + + for (bdom = b->idom; bdom; bdom = bdom->idom) { + if (bdom->jmp.type != Jjnz) + continue; + if (cls == Kw) + if (req(r, bdom->jmp.arg)) + if (dom0non0(fn, bdom, b, is0)) + return 1; + if (cmpeq0def(fn, bdom->jmp.arg, &arg0, &cls1, &eqval)) + if (cls == cls1) + if (req(r, arg0)) + if (dom0non0(fn, bdom, b, is0)) { + *is0 = *is0 ^ eqval; + return 1; + } + } + return 0; +} + +static int +usecls(Use *u, Ref r) +{ + switch (u->type) { + case UXXX: break; + case UIns: + /* safe to take arg[0] cls if both args == r, even for store */ + if (req(u->u.ins->arg[0], r)) + return argcls(u->u.ins, 0); + if (req(u->u.ins->arg[1], r)) + return argcls(u->u.ins, 1); + break; + case UPhi: return u->u.phi->cls; + case UJmp: return Kw; + } + die("unreachable"); +} + +static void +propjnz0(Fn *fn, Blk *bif, Blk *s0, Blk *snon0, Ref r, int cls) +{ + Blk *b; + Tmp *t; + Use *u; + + if (s0->npred != 1 || rtype(r) != RTmp) + return; + t = &fn->tmp[r.val]; + for (u = t->use; u < &t->use[t->nuse]; u++) { + b = fn->rpo[u->bid]; + if (usecls(u, r) == cls) + if (branchdom(fn, bif, s0, snon0, b)) + replaceuse(fn, u, r, con01[0]); + } +} + +static void +dedupjmp(Fn *fn, Blk *b) { + Blk *s1s2[2]; + int64_t v; + Ref arg0; + int cls, eqval, is0; + + if (b->jmp.type != Jjnz) + return; + + /* propagate jmp arg as 0 thru s2 */ + propjnz0(fn, b, b->s2, b->s1, b->jmp.arg, Kw); + /* propagate cmp eq/ne 0 def of jmp arg as 0 */ + s1s2[0] = b->s1; s1s2[1] = b->s2; + if (cmpeq0def(fn, b->jmp.arg, &arg0, &cls, &eqval)) + propjnz0(fn, b, s1s2[eqval^1], s1s2[eqval], arg0, cls); + + /* collapse trivial/constant jnz to jmp */ + v = 1; + is0 = 0; + if (b->s1 == b->s2 + || isconbits(fn, b->jmp.arg, &v) + || is0non0(fn, b, b->jmp.arg, Kw, &is0)) { + if (v == 0 || is0) + b->s1 = b->s2; + /* we later move active ins out of dead blks */ + b->s2 = 0; + b->jmp.type = Jjmp; + rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/); + b->jmp.arg = R; + return; + } +} + +/* rebuild rpo pred use */ +/* careful not to lose active ins in dead blks */ +static void +rebuildcfg(Fn *fn) { + uint n, prevnblk; + Blk **prevrpo; + Blk *b; + Ins *i; + + prevnblk = fn->nblk; + prevrpo = emalloc(prevnblk * sizeof prevrpo[0]); + memcpy(prevrpo, fn->rpo, prevnblk * sizeof prevrpo[0]); + + fillcfg(fn); // TODO - OK? 
+ + for (n=0; nid == -1u) { + assert(b != fn->start); + /* blk unreachable after GVN */ + for (i = b->ins; i < &b->ins[b->nins]; i++) + if (i->op != Onop) + if (!optab[i->op].ispinned) + if (gvndup(i, 0) == i) + /* (possibly) active ins - add to start blk */ + addins(&fn->start->ins, &fn->start->nins, i); + } + } + free(prevrpo); +} + +/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */ +/* require rpo pred ssa use */ +/* recreates rpo */ +/* breaks pred use dom ssa (GCM fixes ssa) */ +void +gvn(Fn *fn) +{ + uint n, nins; + Blk *b; + Phi *p; + + /* facilitate jnz simplification */ + blkmerge(fn); + fillcfg(fn); + filluse(fn); + filldom(fn); + + con01[0] = getcon(0, fn); + con01[1] = getcon(1, fn); + + nins = 0; + for (b=fn->start; b; b=b->link) + for (p = b->phi; p; p = p->link) + p->visit = 0; + + /* facilitate ext elimination */ + fillloop(fn); + narrowpars(fn); + filluse(fn); + ssacheck(fn); + + for (b=fn->start; b; b=b->link) + nins += b->nins; + + gvntbln = nins + nins/2; + gvntbl = emalloc(gvntbln * sizeof gvntbl[0]); + lookupn = 0; + proben = 0; + maxproben = 0; + + /* GVN */ + clrbvisit(fn); + for (n=0; nnblk; n++) { + b = fn->rpo[n]; + dedupphis(fn, b); + dedupins(fn, b); + dedupjmp(fn, b); + } + + rebuildcfg(fn); + + free(gvntbl); + gvntbl = 0; + gvntbln = 0; + lookupn = 0; + proben = 0; + maxproben = 0; + + if (debug['G']) { + fprintf(stderr, "\n> After GVN:\n"); + printfn(fn, stderr); + } +} diff --git a/ins.c b/ins.c new file mode 100644 index 0000000..1935a95 --- /dev/null +++ b/ins.c @@ -0,0 +1,195 @@ +#include "all.h" + +void +addins(Ins **pvins, uint *pnins, Ins *i) +{ + if (i->op == Onop) + return; + vgrow(pvins, ++(*pnins)); + (*pvins)[(*pnins)-1] = *i; +} + +void +addbins(Blk *b, Ins **pvins, uint *pnins) +{ + Ins *i; + + for (i = b->ins; i < &b->ins[b->nins]; i++) + addins(pvins, pnins, i); +} + +static int +unusedins(Fn *fn, Ins *i); + +static int +unusedtmp(Fn *fn, Tmp *t) +{ + if (t->nuse == 0) + return 1; + if (t->nuse == 1) + if (t->use[0].type == UIns) + return unusedins(fn, t->use[0].u.ins); + return 0; +} + +static int +unusedtmpref(Fn *fn, Ref r) +{ + if (rtype(r) != RTmp) + return 0; + return unusedtmp(fn, &fn->tmp[r.val]); +} + +static int +unusedins(Fn *fn, Ins *i) +{ + if (!INRANGE(i->op, Opar, Ocall)) + if (unusedtmpref(fn, i->to)) + return 1; + return 0; +} + +/* replace unused ins with nop */ +/* needs use; breaks use */ +void +nopunused(Fn *fn) +{ + Blk *b; + Ins *i; + + for (b = fn->start; b; b = b->link) + for (i = b->ins; i < &b->ins[b->nins]; i++) + if (unusedins(fn, i)) + *i = (Ins){.op = Onop}; +} + +typedef struct FromLoc FromLoc; +struct FromLoc { + InsLoc from; + uint ton; +}; + +static int +loccmp(InsLoc *la, InsLoc *lb) +{ + if (la->bid != lb->bid) + return (int)la->bid - (int)lb->bid; + return la->insn - lb->insn; +} + +static int +tocmp(const void *a, const void *b) +{ + InsMov *ma, *mb; + + ma = (InsMov*)a; + mb = (InsMov*)b; + return loccmp(&ma->to, &mb->to); +} + +static int +fromcmp(const void *a, const void *b) +{ + FromLoc *fa, *fb; + + fa = (FromLoc*)a; + fb = (FromLoc*)b; + return loccmp(&fa->from, &fb->from); +} + +static int +loceq(InsLoc *a, InsLoc *b) +{ + return a->bid == b->bid && a->insn == b->insn; +} + +/* after, mov is sorted by to, and to.insn, from.insn are updated */ +void +movins(Fn *fn, InsMov *mov, uint nmov, int del) +{ + Blk *b, *b2; + uint bid, n, fromn, ton, ifromn, nins; + FromLoc *from; + uint *newbnins; + Ins **newbins, *vins; + + qsort(mov, nmov, sizeof mov[0], tocmp); + + 
from = emalloc(nmov * sizeof from[0]); + for (n = 0; n < nmov; n++) { + from[n].from = mov[n].from; + from[n].ton = n; + } + qsort(from, nmov, sizeof from[0], fromcmp); + + nins = 0; + vins = vnew(nins, sizeof vins[0], PFn); + + newbnins = emalloc(fn->nblk * sizeof newbnins[0]); + newbins = emalloc(fn->nblk * sizeof newbins[0]); + + /* populate new ins buffers */ + /* update mov to.insn, from insn */ + fromn = ton = 0; + for (bid = 0; bid < fn->nblk; bid++) { + /* no moves to this block */ + if (nmov <= ton || mov[ton].to.bid != bid) { + while (fromn < nmov && from[fromn].from.bid == bid) + fromn++; + continue; + } + b = fn->rpo[bid]; + nins = 0; + for (ifromn = 0; ifromn < b->nins; ifromn++) { + /* insert new ins, update to */ + while (ton < nmov && loceq(&mov[ton].to, &(InsLoc){.bid = bid, .insn = ifromn})) { + b2 = fn->rpo[mov[ton].from.bid]; + assert(mov[ton].from.insn < b2->nins); + addins(&vins, &nins, &b2->ins[mov[ton].from.insn]); + mov[ton++].to.insn = nins-1; + } + /* update from */ + while (fromn < nmov && loceq(&from[fromn].from, &(InsLoc){.bid = bid, .insn = ifromn})) + from[fromn++].from.insn = nins; + /* copy original ins */ + addins(&vins, &nins, &b->ins[ifromn]); + } + /* append new ins, update to */ + while (ton < nmov && mov[ton].to.bid == bid) { + assert(mov[ton].to.insn == b->nins); + b2 = fn->rpo[mov[ton].from.bid]; + assert(mov[ton].from.insn < b2->nins); + addins(&vins, &nins, &b2->ins[mov[ton].from.insn]); + mov[ton++].to.insn = nins-1; + } + assert(ifromn == b->nins); + /* idup(&newbins[bid], vins, nins); */ + newbins[bid] = vins; + vins = vnew(0, sizeof vins[0], PFn); + newbnins[bid] = nins; + } + assert(fromn == nmov); + assert(ton == nmov); + + /* install new b->ins */ + for (bid = 0; bid < fn->nblk; bid++) { + if (newbnins[bid] == 0) + continue; + b = fn->rpo[bid]; + b->ins = newbins[bid]; + b->nins = newbnins[bid]; + } + + /* remove from ins, update mov from insn */ + for (fromn = 0; fromn < nmov; fromn++) { + b = fn->rpo[from[fromn].from.bid]; + assert(from[fromn].from.insn < b->nins); + if (del) + b->ins[from[fromn].from.insn] = (Ins){.op = Onop}; + mov[from[fromn].ton].from.insn = from[fromn].from.insn; + } + + free(from); + free(newbins); + free(newbnins); +} diff --git a/main.c b/main.c index 977c55b..ba3ad0c 100644 --- a/main.c +++ b/main.c @@ -72,10 +72,18 @@ func(Fn *fn) fillalias(fn); coalesce(fn); filluse(fn); + filldom(fn); ssacheck(fn); - copy(fn); + gvn(fn); + fillcfg(fn); + filluse(fn); + ifelim(fn); + fillcfg(fn); filluse(fn); - fold(fn); + filldom(fn); + gcm(fn); + filluse(fn); + ssacheck(fn); T.abi1(fn); simpl(fn); fillcfg(fn); diff --git a/ops.h b/ops.h index beaa6f3..00d62a0 100644 --- a/ops.h +++ b/ops.h @@ -6,188 +6,185 @@ #define V(Imm) #endif -#ifndef P - #define P(CanFold, HasId, IdVal) +#ifndef F +#define F(CanFold, HasId, IdVal, Commutes, Associates, Idemp, IsCmpEq, IsCmpLgte, CmpEqVal, IsPinned) #endif - #define T(a,b,c,d,e,f,g,h) { \ {[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}, \ {[Kw]=K##e, [Kl]=K##f, [Ks]=K##g, [Kd]=K##h} \ } - /*********************/ /* PUBLIC OPERATIONS */ /*********************/ /* Arithmetic and Bits */ -O(add, T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(1) -O(sub, T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(0) -O(neg, T(w,l,s,d, x,x,x,x), P(1,0,0)) X(1,1,0) V(0) -O(div, T(w,l,s,d, w,l,s,d), P(1,1,1)) X(0,0,0) V(0) -O(rem, T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0) -O(udiv, T(w,l,e,e, w,l,e,e), P(1,1,1)) X(0,0,0) V(0) -O(urem, T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0) -O(mul, T(w,l,s,d, w,l,s,d), P(1,1,1)) 
X(2,0,0) V(0) -O(and, T(w,l,e,e, w,l,e,e), P(1,0,0)) X(2,1,0) V(1) -O(or, T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1) -O(xor, T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1) -O(sar, T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1) -O(shr, T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1) -O(shl, T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1) +O(add, T(w,l,s,d, w,l,s,d), F(1,1,0,1,1,0,0,0,0,0)) X(2,1,0) V(1) +O(sub, T(w,l,s,d, w,l,s,d), F(1,1,0,0,0,0,0,0,0,0)) X(2,1,0) V(0) +O(neg, T(w,l,s,d, x,x,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0) +O(div, T(w,l,s,d, w,l,s,d), F(1,1,1,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(rem, T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(udiv, T(w,l,e,e, w,l,e,e), F(1,1,1,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(urem, T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(mul, T(w,l,s,d, w,l,s,d), F(1,1,1,1,0,0,0,0,0,0)) X(2,0,0) V(0) +O(and, T(w,l,e,e, w,l,e,e), F(1,0,0,1,1,1,0,0,0,0)) X(2,1,0) V(1) +O(or, T(w,l,e,e, w,l,e,e), F(1,1,0,1,1,1,0,0,0,0)) X(2,1,0) V(1) +O(xor, T(w,l,e,e, w,l,e,e), F(1,1,0,1,1,0,0,0,0,0)) X(2,1,0) V(1) +O(sar, T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0,0)) X(1,1,0) V(1) +O(shr, T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0,0)) X(1,1,0) V(1) +O(shl, T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0,0)) X(1,1,0) V(1) /* Comparisons */ -O(ceqw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cnew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(csgew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(csgtw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cslew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(csltw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1) -O(cugew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cugtw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(culew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cultw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1) - -O(ceql, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cnel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(csgel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(csgtl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cslel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(csltl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1) -O(cugel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cugtl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(culel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cultl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1) - -O(ceqs, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cges, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cgts, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cles, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(clts, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cnes, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cos, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cuos, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0) - -O(ceqd, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cged, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cgtd, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cled, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cltd, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cned, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cod, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) -O(cuod, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0) +O(ceqw, T(w,w,e,e, w,w,e,e), F(1,1,1,1,0,0,1,0,1,0)) X(0,1,0) V(0) +O(cnew, T(w,w,e,e, w,w,e,e), F(1,1,0,1,0,0,1,0,0,0)) X(0,1,0) V(0) +O(csgew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(csgtw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0) +O(cslew, T(w,w,e,e, 
w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(csltw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1) +O(cugew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(cugtw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0) +O(culew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(cultw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1) + +O(ceql, T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,0,1,0,1,0)) X(0,1,0) V(0) +O(cnel, T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,0,1,0,0,0)) X(0,1,0) V(0) +O(csgel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(csgtl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0) +O(cslel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(csltl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1) +O(cugel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(cugtl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0) +O(culel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0) +O(cultl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1) + +O(ceqs, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cges, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cgts, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cles, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(clts, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cnes, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cos, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cuos, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) + +O(ceqd, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cged, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cgtd, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cled, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cltd, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cned, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cod, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) +O(cuod, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0) /* Memory */ -O(storeb, T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0) -O(storeh, T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0) -O(storew, T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0) -O(storel, T(l,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0) -O(stores, T(s,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0) -O(stored, T(d,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0) - -O(loadsb, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(loadub, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(loadsh, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(loaduh, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(loadsw, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(loaduw, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(load, T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,1) V(0) +O(storeb, T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(storeh, T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(storew, T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(storel, T(l,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(stores, T(s,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(stored, T(d,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) + +O(loadsb, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(loadub, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(loadsh, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) 
+O(loaduh, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(loadsw, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(loaduw, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) +O(load, T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) /* Extensions and Truncations */ -O(extsb, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(extub, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(extsh, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(extuh, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(extsw, T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(extuw, T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0) - -O(exts, T(e,e,e,s, e,e,e,x), P(1,0,0)) X(0,0,1) V(0) -O(truncd, T(e,e,d,e, e,e,x,e), P(1,0,0)) X(0,0,1) V(0) -O(stosi, T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(stoui, T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(dtosi, T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(dtoui, T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0) -O(swtof, T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0) -O(uwtof, T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0) -O(sltof, T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0) -O(ultof, T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0) -O(cast, T(s,d,w,l, x,x,x,x), P(1,0,0)) X(0,0,1) V(0) +O(extsb, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(extub, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(extsh, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(extuh, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(extsw, T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(extuw, T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) + +O(exts, T(e,e,e,s, e,e,e,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(truncd, T(e,e,d,e, e,e,x,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(stosi, T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(stoui, T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(dtosi, T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(dtoui, T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(swtof, T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(uwtof, T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(sltof, T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(ultof, T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(cast, T(s,d,w,l, x,x,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) /* Stack Allocation */ -O(alloc4, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(alloc8, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(alloc16, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) +O(alloc4, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(alloc8, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(alloc16, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) /* Variadic Function Helpers */ -O(vaarg, T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(vastart, T(m,e,e,e, x,e,e,e), P(0,0,0)) X(0,0,0) V(0) +O(vaarg, T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(vastart, T(m,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) -O(copy, T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,1) V(0) +O(copy, T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) /* Debug */ -O(dbgloc, T(w,e,e,e, w,e,e,e), P(0,0,0)) X(0,0,1) V(0) +O(dbgloc, T(w,e,e,e, w,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0) /****************************************/ /* INTERNAL OPERATIONS (keep nop first) */ /****************************************/ /* Miscellaneous and Architecture-Specific Operations */ 
-O(nop, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,1) V(0) -O(addr, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(blit0, T(m,e,e,e, m,e,e,e), P(0,0,0)) X(0,1,0) V(0) -O(blit1, T(w,e,e,e, x,e,e,e), P(0,0,0)) X(0,1,0) V(0) -O(swap, T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,0,0) V(0) -O(sign, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(salloc, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(xidiv, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0) -O(xdiv, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0) -O(xcmp, T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,1,0) V(0) -O(xtest, T(w,l,e,e, w,l,e,e), P(0,0,0)) X(1,1,0) V(0) -O(acmp, T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0) -O(acmn, T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0) -O(afcmp, T(e,e,s,d, e,e,s,d), P(0,0,0)) X(0,0,0) V(0) -O(reqz, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(rnez, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0) +O(nop, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(addr, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(blit0, T(m,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0) +O(blit1, T(w,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0) +O(swap, T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0) +O(sign, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(salloc, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(xidiv, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0) +O(xdiv, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0) +O(xcmp, T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0) +O(xtest, T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0) +O(acmp, T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(acmn, T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(afcmp, T(e,e,s,d, e,e,s,d), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(reqz, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) +O(rnez, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0) /* Arguments, Parameters, and Calls */ -O(par, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(parsb, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(parub, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(parsh, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(paruh, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(parc, T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(pare, T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(arg, T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(argsb, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(argub, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(argsh, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(arguh, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(argc, T(e,x,e,e, e,l,e,e), P(0,0,0)) X(0,0,0) V(0) -O(arge, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0) -O(argv, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) -O(call, T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0) +O(par, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(parsb, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(parub, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(parsh, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(paruh, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(parc, T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(pare, T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(arg, T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(argsb, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(argub, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) 
V(0) +O(argsh, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(arguh, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(argc, T(e,x,e,e, e,l,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(arge, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(argv, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) +O(call, T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0) /* Flags Setting */ -O(flagieq, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagine, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagisge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagisgt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagisle, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagislt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagiuge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagiugt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagiule, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagiult, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfeq, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfgt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfle, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagflt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfne, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfo, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) -O(flagfuo, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0) - +O(flagieq, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagine, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagisge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagisgt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagisle, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagislt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagiuge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagiugt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagiule, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagiult, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfeq, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfgt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfle, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagflt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfne, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfo, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) +O(flagfuo, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0) #undef T #undef X diff --git a/parse.c b/parse.c index ebfc39f..e2ffccf 100644 --- a/parse.c +++ b/parse.c @@ -15,11 +15,17 @@ enum { }; Op optab[NOp] = { -#undef P -#define P(cf, hi, id) .canfold = cf, .hasid = hi, .idval = id -#define O(op, t, p) [O##op]={.name = #op, .argcls = t, p}, +#undef F +#define F(cf, hi, id, co, as, im, ic, lg, cv, pn) \ + .canfold = cf, \ + .hasid = hi, .idval = id, \ + .commutes = co, .assoc = as, \ + .idemp = im, \ + .cmpeqwl = ic, .cmplgtewl = lg, .eqval = cv, \ + .ispinned = pn +#define O(op, k, flags) [O##op]={.name = #op, .argcls = k, flags}, #include "ops.h" -#undef P +#undef F }; typedef enum { diff --git a/reassoc.c b/reassoc.c new file mode 100644 index 0000000..25d88ea --- /dev/null +++ b/reassoc.c @@ -0,0 +1,101 @@ +#include "all.h" + +static void 
+ireassoc(Fn *fn, Blk *b, Ins *i, uint *pnim, InsMov **pim)
+{
+	Blk *b2;
+	Tmp *t;
+	Use *u;
+	int64_t v;
+	int x;
+
+	assert(b->ins <= i && i < &b->ins[b->nins]);
+	if (!iscmp(i->op, &x, &x))
+	if (!iskladdcon(fn, i, &v))
+		return;
+
+	assert(rtype(i->to) == RTmp);
+	t = &fn->tmp[i->to.val];
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		if (u->type == UPhi)
+			continue;
+		if (u->type == UIns)
+		if (INRANGE(u->u.ins->op, Oarg, Ocall))
+			continue;
+		b2 = fn->rpo[u->bid];
+		vgrow(pim, ++(*pnim));
+		(*pim)[(*pnim)-1] = (InsMov){
+			.from = {.bid = b->id, .insn = i-b->ins},
+			.to = {
+				.bid = u->bid,
+				.insn = u->type == UJmp ? b2->nins : u->u.ins-b2->ins
+			}};
+	}
+}
+
+static void
+trealloc(Fn *fn, Blk *b, Ins *i)
+{
+	Ins *i2;
+	Ref r;
+
+	assert(b->ins <= i && i < &b->ins[b->nins]);
+	r = newtmp("rea", i->cls, fn);
+	if (i < &b->ins[b->nins-1]) {
+		i2 = i+1;
+		/* special case of both args of target instruction reassociated */
+		if (!req(i->to, i2->arg[0]) && !req(i->to, i2->arg[1])) {
+			assert(i < &b->ins[b->nins-2]);
+			i2 = i+2;
+		}
+		assert(req(i->to, i2->arg[0]) || req(i->to, i2->arg[1]));
+		if (req(i->to, i2->arg[0]))
+			i2->arg[0] = r;
+		if (req(i->to, i2->arg[1]))
+			i2->arg[1] = r;
+	} else {
+		assert(req(i->to, b->jmp.arg));
+		b->jmp.arg = r;
+	}
+	i->to = r;
+}
+
+/* Redistribute trivial ops to point of use. */
+/* Reduces register pressure. */
+/* needs rpo, use; breaks use */
+void
+reassoc(Fn *fn)
+{
+	Blk *b;
+	Ins *i;
+	InsMov *im;
+	uint n, nim;
+
+	nim = 0;
+	im = vnew(nim, sizeof im[0], PHeap);
+
+	/* identify trivial ins */
+	for (b = fn->start; b; b = b->link) {
+		for (i = b->ins; i < &b->ins[b->nins]; i++)
+			ireassoc(fn, b, i, &nim, &im);
+	}
+
+	/* duplicate trivial ins */
+	movins(fn, im, nim, 0/*!del*/);
+
+	/* create new tmps for dups */
+	for (n = 0; n < nim; n++) {
+		b = fn->rpo[im[n].to.bid];
+		i = &b->ins[im[n].to.insn];
+		trealloc(fn, b, i);
+	}
+
+	/* delete (now) unused ins */
+	filluse(fn);
+	nopunused(fn);
+
+	if (debug['G']) {
+		fprintf(stderr, "\n> After Reassociation:\n");
+		printfn(fn, stderr);
+	}
+}
diff --git a/ssa.c b/ssa.c
index 929301e..4e3d2a5 100644
--- a/ssa.c
+++ b/ssa.c
@@ -1,7 +1,7 @@
 #include "all.h"
 #include <stdarg.h>
 
-static void
+void
 adduse(Tmp *tmp, int ty, Blk *b, ...)
{ Use *u; diff --git a/test/_gcm1.ssa b/test/_gcm1.ssa new file mode 100644 index 0000000..719cddb --- /dev/null +++ b/test/_gcm1.ssa @@ -0,0 +1,48 @@ +export +function w $ifmv(w %p1, w %p2, w %p3) { +@start +@entry + %rt =w add %p2, %p3 # gcm moves to @true + %rf =w sub %p2, %p3 # gcm moves to @false + jnz %p1, @true, @false +@true + %r =w copy %rt + jmp @exit +@false + %r =w copy %rf + jmp @exit +@exit + ret %r +} + +export +function w $hoist1(w %p1, w %p2, w %p3) { +@start +@entry + %n =w copy 0 + %i =w copy %p1 +@loop + %base =w add %p2, %p3 # gcm moves to @exit + %i =w sub %i, 1 + %n =w add %n, 1 + jnz %i, @loop, @exit +@exit + %r =w add %base, %n + ret %r +} + +export +function w $hoist2(w %p1, w %p2, w %p3) { +@start +@entry + %n =w copy 0 + %i =w copy %p1 +@loop + %base =w add %p2, %p3 # gcm moves to @entry + %i =w sub %i, 1 + %n =w add %n, %base + jnz %i, @loop, @exit +@exit + %r =w add %base, %n + ret %r +} diff --git a/test/_gcm2.ssa b/test/_gcm2.ssa new file mode 100644 index 0000000..baeb4a4 --- /dev/null +++ b/test/_gcm2.ssa @@ -0,0 +1,43 @@ +# Programs from "Global Code Motion Global Value Numbering" by Cliff Click +# https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf + +# GCM program in Figure 1 + +function w $gcm_test(w %a){ +@start + %i.0 =w copy 0 +@loop + %i.1 =w phi @start %i.0, @loop %i.2 + %b =w add %a, 1 # early schedule moves to @start + %i.2 =w add %i.1, %b + %c =w mul %i.2, 2 # late schedule moves to @end + %x =w csltw %i.2, 10 + jnz %x, @loop, @end +@end + ret %c +} + +# GCM program in "Figure 3 x's definition does not dominate it's use" +# +# SSA contruction will insert phi instruction for "x" in @if_false +# preventing the "add" in @if_false from being moved to @if_true + +function $gcm_test2 (w %a){ +@start + %f =w copy 1 + %x =w copy 0 + %s.0 =w copy 0 +@loop + %s.1 = w phi @start %s.0, @if_false %s.2 + jnz %a, @if, @end +@if + jnz %f, @if_true, @if_false +@if_true + %f =w copy 0 + %x =w add %x, 1 +@if_false + %s.2 =w add %s.1, %x + jmp @loop +@end + ret +} diff --git a/test/_load-elim.ssa b/test/_load-elim.ssa new file mode 100644 index 0000000..faae478 --- /dev/null +++ b/test/_load-elim.ssa @@ -0,0 +1,17 @@ +# GCM can eliminate unused add/load instructions + +export +function w $f(l %p, w %c) { +@start + jnz %c, @true, @false +@true + %p1 =l add %p, 4 + %v1 =w loaduw %p1 + jmp @end +@false + %p2 =l add %p, 4 + %v2 =w loaduw %p2 + jmp @end +@end + ret 0 +} diff --git a/test/gvn-live-dead.ssa b/test/gvn-live-dead.ssa new file mode 100644 index 0000000..d47f05b --- /dev/null +++ b/test/gvn-live-dead.ssa @@ -0,0 +1,19 @@ +export +function w $test(w %p1, w %p2) { +@start +@entry + %t1 =w copy 1 + jnz %t1, @live, @dead1 +@live + %t2 =w add %p1, %p2 + ret %t2 +@dead1 + %t2 =w add %p1, %p2 # live ins in dead blk +@dead2 + jnz %t1, @live, @dead1 +} + +# >>> driver +# extern int test(int p1, int p2); +# int main() { return test(1, 2) != 3; } +# <<< diff --git a/test/non0jnz.ssa b/test/non0jnz.ssa new file mode 100644 index 0000000..33f9a96 --- /dev/null +++ b/test/non0jnz.ssa @@ -0,0 +1,31 @@ +# GVN 0/non-0 inference removes @yesyes, @yesno, @noyes, @nono + +export +function w $test(w %c) { +@start + jnz %c, @yes, @no +@yes + %c0 =w cnew %c, 0 + jnz %c0, @yesyes, @yesno +@yesyes + %rc =w copy 1 + jmp @end +@yesno + %rc =w copy 111 + jmp @end +@no + %c1 =w cnew %c, 0 + jnz %c1, @noyes, @nono +@noyes + %rc =w copy 222 + jmp @end +@nono + %rc =w copy 0 +@end + ret %rc +} + +# >>> driver +# int test(int); +# int main(void) { return test(0); 
} +# <<< diff --git a/util.c b/util.c index 5a0c89f..ba1f98f 100644 --- a/util.c +++ b/util.c @@ -229,6 +229,24 @@ iscmp(int op, int *pk, int *pc) return 1; } +static int INVCMPWL[] = { + /*Oceqw*/Ocnew, /*Ocnew*/Oceqw, + /*Ocsgew*/Ocsltw, /*Ocsgtw*/Ocslew, /*Ocslew*/Ocsgtw, /*Ocsltw*/Ocsgew, + /*Ocugew*/Ocultw, /*Ocugtw*/Oculew, /*Oculew*/Ocugtw, /*Ocultw*/Ocugew, + /*Oceql*/Ocnel, /*Ocnel*/Oceql, + /*Ocsgel*/Ocsltl, /*Ocsgtl*/Ocslel, /*Ocslel*/Ocsgtl, /*Ocsltl*/Ocsgel, + /*Ocugel*/Ocultl, /*Ocugtl*/Oculel, /*Oculel*/Ocugtl, /*Ocultl*/Ocugel, +}; + +int +invcmpwl(int cmp) +{ + assert(Oceqw <= cmp && cmp <= Ocultl); + return INVCMPWL[cmp - Oceqw]; +} + + + int argcls(Ins *i, int n) { @@ -338,6 +356,23 @@ phicls(int t, Tmp *tmp) return t1; } +uint +phiargn(Phi *p, Blk *b) +{ + uint n; + + for (n = 0; n < p->narg; n++) + if (p->blk[n] == b) + return n; + die("unreachable"); +} + +Ref +phiarg(Phi *p, Blk *b) +{ + return p->arg[phiargn(p, b)]; +} + Ref newtmp(char *prfx, int k, Fn *fn) { @@ -421,6 +456,27 @@ addcon(Con *c0, Con *c1, int m) return 1; } +int +isconbits(Fn *fn, Ref r, int64_t *v) +{ + Con *c; + + if (rtype(r) == RCon) { + c = &fn->con[r.val]; + if (c->type == CBits) { + *v = c->bits.i; + return 1; + } + } + return 0; +} + +int +istmpconbits(Fn *fn, Ins *i, int64_t *v) +{ + return rtype(i->arg[0]) == RTmp && isconbits(fn, i->arg[1], v); +} + void salloc(Ref rt, Ref rs, Fn *fn) { -- cgit v1.2.3