| author | Roland Paterson-Jones <[email protected]> | 2024-06-19 16:48:11 +0200 |
|---|---|---|
| committer | Quentin Carbonneaux <[email protected]> | 2025-03-14 09:58:37 +0100 |
| commit | c2ff93e75e5f6df8e1679120b18f0d5884deab2b | |
| tree | b0f728874af29ea01e29f0575b4e5d8a3228a252 /copy.c | |
| parent | 9e36cbe4d8f8c9ff3d739ff9ead2a4a4988c0904 | |
Global Value Numbering / Global Code Motion
More or less as proposed in its ninth iteration, with the
addition of a gcmmove() function to restore coherent
local schedules.
Changes since RFC 8:
Features:
- generalization of phi 1/0 detection
- collapse linear jmp chains before GVN; this simplifies the if-graph
  detection used in 0/non-0 value inference and if-elimination
- infer 0/non-0 values from dominating blk jnz; eliminates
redundant cmp eq/ne 0 and associated jnz/blocks, for example
redundant null pointer checks (hare codebase likes this)
- remove (emergent) empty if-then-else graphlets between GVN and
GCM; improves GCM instruction placement, particularly cmps.
- merge %addr =l add %addr1, N sequences; reduces tmp count and
  register pressure
- squash consecutive associative ops with constant args, e.g.
  t1 = add t, N ... t2 = add t1, M -> t2 = add t, N+M
  (see the sketch after this list)
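To make the squash rule concrete, here is a minimal sketch on a simplified, hypothetical instruction model; it shows the shape of the rewrite, not QBE's actual Ins layout or its GVN code:

```c
#include <stdint.h>

/* hypothetical, simplified instruction: "to = op arg, con" */
typedef struct {
	int op;       /* operation, e.g. Oadd */
	int to;       /* result tmp */
	int arg;      /* tmp argument */
	uint64_t con; /* constant argument (two's-complement wrap) */
} SIns;

enum { Oadd = 1 };

/* given def "t1 = add t, N" and use "t2 = add t1, M", rewrite
 * the use as "t2 = add t, N+M"; t1 may then become dead */
static int
squashadd(SIns *def, SIns *use)
{
	if (def->op != Oadd || use->op != Oadd)
		return 0;
	if (use->arg != def->to)
		return 0;
	use->arg = def->arg;
	use->con += def->con;
	return 1;
}
```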
Bug Fixes:
- remove "cmp eq/ne of non-identical RCon's " in copyref().
RCon's are not guaranteed to be dedup'ed, and symbols can
alias.
Codebase:
- moved some code into cfg.c, including blkmerge()
- some refactoring in gvn.c
- simplification of reassoc.c - always reassociate all cmp ops
  and Kl add %t, N; better on coremark, smaller codebase
- minor simplification of movins() - use vins
Testing - standard QBE, cproc, hare, harec, coremark
[still have Rust build issues with latest roland]
Benchmarks:
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" ~8% faster
  (crypto::sha1::sha1_1gb is the biggest obvious win - ~25% faster)
Changes since RFC 7:
Bug fixes:
- remove isbad4gcm() in GVN/GCM - it is unsound due to different state
  at GVN vs GCM time; replaced with a "reassociation" pass after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions, because GCM will
  not move them
- remove cmp eq/ne identical-arg copy detection for floating point;
  it is not valid for NaN (see the C demonstration after this list)
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
respectively; just a typo
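The NaN fix follows from IEEE 754: every ordered comparison involving NaN is false, so a floating-point cmp eq of a value with itself is not constant 1 and must not be treated as a copy. A standalone C check:

```c
#include <math.h>
#include <stdio.h>

int
main(void)
{
	double x = NAN;

	/* prints 0 then 1: x == x is false for NaN, so folding
	 * an eq compare of identical fp args to 1 would be wrong */
	printf("x == x: %d\n", x == x);
	printf("x != x: %d\n", x != x);
	return 0;
}
```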
Minor features:
- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use counts in GVN; not strictly necessary but
  enables interim global-state sanity checking
- "reassociation" of trivial constant offset load/store addresses, and
cmp ops with point-of-use in pass after GCM
- normalise commutative op arg order, e.g. op con, tmp -> op tmp, con,
  to simplify copy detection and GVN instruction dedup (see the sketch
  after this list)
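Arg-order normalisation is plain canonicalisation: with the constant always second, structurally equal instructions compare and hash equal in GVN's dedup table. A sketch under an assumed ref encoding (negative values stand for constants), not QBE's Ref type:

```c
/* hypothetical, simplified instruction with two refs */
typedef struct {
	int op;
	int arg[2];
} NIns;

static int
isconref(int ref)
{
	return ref < 0;
}

/* canonicalise "op con, tmp" into "op tmp, con" for a
 * commutative op so GVN sees a single form */
static void
normalise(NIns *i, int commutes)
{
	int t;

	if (!commutes)
		return;
	if (isconref(i->arg[0]) && !isconref(i->arg[1])) {
		t = i->arg[0];
		i->arg[0] = i->arg[1];
		i->arg[1] = t;
	}
}
```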
Codebase:
- split core copy detection and constant folding back out into
  copy.c and fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactor
Changes since RFC 6:
- more ext elimination in GVN by examining def and use bit widths
- elimination of redundant and masks by bit-width examination (see the
  sketch after this list)
- incorporation of Song's patch
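The and-mask rule reappears in the copy.c diff below: copyref() treats and %x, m as a copy of %x when m == 2^N-1 and the definition of %x is known to fit in N bits. A self-contained C sketch of that predicate, with defwidth standing in for the defwidthle() analysis:

```c
#include <stdint.h>

/* bits needed to represent v (0 for v == 0); same result as
 * u64_wbits() in the diff below */
static unsigned
wbits(uint64_t v)
{
	unsigned n;

	for (n = 0; v; v >>= 1)
		n++;
	return n;
}

/* "and %x, m" is redundant when m == 2^N - 1 and %x already
 * fits in N bits; defwidth is a stand-in for defwidthle() */
static int
redundantmask(uint64_t m, unsigned defwidth)
{
	if (((m + 1) & m) != 0) /* require m == 2^N - 1 */
		return 0;
	return defwidth <= wbits(m);
}
```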
Changes since RFC 5:
- avoidance of "bad" candidates for GVN/GCM - trivial address offset
calculations, and comparisons
- more copy detection mostly around boolean values
- allow elimination of unused load, alloc, trapping instructions
- detection of trivial boolean v ? 1 : 0 phi patterns
- bug fix for (removal of) "chg" optimisation in ins recreation - it
  was missing removal of unused instructions in some cases
- ifelim() between GVN and GCM; deeper nopunused()
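nopunused() is use-count-driven dead-instruction removal; a minimal sketch of the idea under an assumed, simplified data model (hypothetical names, not QBE's API):

```c
enum { Xnop, Xadd, Xload, Xdiv };

/* hypothetical instruction: result tmp index, or -1 for none */
typedef struct {
	int op;
	int to;
} DIns;

/* ops safe to drop when their result is unused; per the notes
 * above this can include loads, allocs and divisions proven
 * non-trapping */
static int
candrop(int op)
{
	return op == Xadd || op == Xload || op == Xdiv;
}

/* overwrite with nops all droppable instructions whose result
 * has a zero use count */
static void
nopunusedins(DIns *ins, int nins, const int *nuse)
{
	int n;

	for (n = 0; n < nins; n++)
		if (ins[n].to >= 0
		&& candrop(ins[n].op)
		&& nuse[ins[n].to] == 0)
			ins[n].op = Xnop;
}
```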
Diffstat (limited to 'copy.c')
| -rw-r--r-- | copy.c | 475 |
1 file changed, 295 insertions, 180 deletions
```diff
@@ -1,217 +1,332 @@
 #include "all.h"
 
-static int
-iscon(Ref r, int64_t bits, Fn *fn)
+static uint
+u64_wbits(uint64_t v)
 {
-	return rtype(r) == RCon
-		&& fn->con[r.val].type == CBits
-		&& fn->con[r.val].bits.i == bits;
+	uint n;
+
+	n = 0;
+	if (v >> 32) { n += 32; v >>= 32; }
+	if (v >> 16) { n += 16; v >>= 16; }
+	if (v >> 8) { n += 8; v >>= 8; }
+	if (v >> 4) { n += 4; v >>= 4; }
+	if (v >> 2) { n += 2; v >>= 2; }
+	if (v >> 1) { n += 1; v >>= 1; }
+	return n+v;
 }
 
 static int
-iscopy(Ins *i, Ref r, Fn *fn)
+EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };
+
+static uint
+EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };
+
+static uint
+EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };
+
+static uint
+STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };
+
+/* is the ref used only as a narrow value? */
+static int
+usewidthle(Fn *fn, Ref r, uint wbits)
 {
-	static bits extcpy[] = {
-		[WFull] = 0,
-		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
-		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
-		[Wsh] = BIT(Wsh) | BIT(Wsw),
-		[Wuh] = BIT(Wuh) | BIT(Wuw),
-		[Wsw] = BIT(Wsw),
-		[Wuw] = BIT(Wuw),
-	};
-	Op *op;
-	bits b;
 	Tmp *t;
+	Use *u;
+	Phi *p;
+	int b;
+	Ins *i;
+	Ref rc;
+	int64_t v;
 
-	if (i->op == Ocopy)
+	if (isconbits(fn, r, &v))
+	if (u64_wbits(v) <= wbits)
 		return 1;
-	op = &optab[i->op];
-	if (op->hasid && KBASE(i->cls) == 0)
-		return iscon(i->arg[1], op->idval, fn);
-	if (!isext(i->op) || rtype(r) != RTmp)
+	if (rtype(r) != RTmp)
 		return 0;
-	if (i->op == Oextsw || i->op == Oextuw)
-	if (i->cls == Kw)
-		return 1;
-
 	t = &fn->tmp[r.val];
-	assert(KBASE(t->cls) == 0);
-	if (i->cls == Kl && t->cls == Kw)
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		switch (u->type) {
+		case UPhi:
+			p = u->u.phi;
+			if (p->visit)
+				continue;
+			p->visit = 1;
+			b = usewidthle(fn, p->to, wbits);
+			p->visit = 0;
+			if (b)
+				continue;
+			break;
+		case UIns:
+			i = u->u.ins;
+			assert(i != 0);
+			if (i->op == Ocopy)
+			if (usewidthle(fn, i->to, wbits))
+				continue;
+			if (isext(i->op)) {
+				if (EXTW[i->op - Oextsb] <= wbits)
+					continue;
+				else
+				if (usewidthle(fn, i->to, wbits))
+					continue;;
+			}
+			if (i->op == Oand) {
+				if (req(r, i->arg[0]))
+					rc = i->arg[1];
+				else {
+					assert(req(r, i->arg[1]));
+					rc = i->arg[0];
+				}
+				if (isconbits(fn, rc, &v))
+				if (u64_wbits(v) <= wbits)
+					continue;
+				break;
+			}
+			if (isstore(i->op))
+			if (req(r, i->arg[1]))
+			if (STW[i->op - Ostoreb] > wbits)
+				continue;
+			break;
+		default:
+			break;
+		}
 		return 0;
-	b = extcpy[t->width];
-	return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
+	}
+	return 1;
 }
 
-static Ref
-copyof(Ref r, Ref *cpy)
+static Phi*
+findphi(Fn *fn, uint bid, Ref to)
 {
-	if (rtype(r) == RTmp && !req(cpy[r.val], R))
-		return cpy[r.val];
-	return r;
+	Phi *p;
+	for (p = fn->rpo[bid]->phi; p; p = p->link)
+		if (req(p->to, to))
+			break;
+	assert(p);
+	return p;
 }
 
-/* detects a cluster of phis/copies redundant with 'r';
- * the algorithm is inspired by Section 3.2 of "Simple
- * and Efficient SSA Construction" by Braun M. et al.
- */
-static void
-phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn)
+static uint
+uint_min(uint v1, uint v2)
 {
-	Use **stk, *u, *u1;
-	uint nstk, a;
-	int t;
-	Ref r1;
-	Phi *p0;
-
-	bszero(ts);
-	bszero(as);
-	p0 = &(Phi){.narg = 0};
-	stk = *pstk;
-	nstk = 1;
-	stk[0] = &(Use){.type = UPhi, .u.phi = p};
-	while (nstk) {
-		u = stk[--nstk];
-		if (u->type == UIns && iscopy(u->u.ins, r, fn)) {
-			p = p0;
-			t = u->u.ins->to.val;
-		}
-		else if (u->type == UPhi) {
-			p = u->u.phi;
-			t = p->to.val;
-		}
-		else
-			continue;
-		if (bshas(ts, t))
-			continue;
-		bsset(ts, t);
-		for (a=0; a<p->narg; a++) {
-			r1 = copyof(p->arg[a], cpy);
-			if (req(r1, r))
-				continue;
-			if (rtype(r1) != RTmp)
-				return;
-			bsset(as, r1.val);
+	return v1 < v2 ? v1 : v2;
+}
+
+/* is the ref def a narrow value? */
+static int
+defwidthle(Fn *fn, Ref r, uint wbits)
+{
+	Tmp *t;
+	Phi *p;
+	Ins *i;
+	uint n;
+	int64_t v;
+	int x;
+
+	if (isconbits(fn, r, &v))
+	if (u64_wbits(v) <= wbits)
+		return 1;
+	if (rtype(r) != RTmp)
+		return 0;
+	t = &fn->tmp[r.val];
+	if (t->cls != Kw)
+		return 0;
+	i = t->def;
+	if (i == 0) {
+		/* phi def */
+		p = findphi(fn, t->bid, r);
+		if (p->visit)
+			return 1;
+		p->visit = 1;
+		for (n = 0; n < p->narg; n++)
+			if (!defwidthle(fn, p->arg[n], wbits)) {
+				p->visit = 0;
+				return 0;
+			}
+		p->visit = 0;
+		return 1;
+	}
+	/* ins def */
+	if (i->op == Ocopy)
+		return defwidthle(fn, i->arg[0], wbits);
+	if (i->op == Oshr || i->op == Osar) {
+		if (isconbits(fn, i->arg[1], &v))
+		if (0 < v && v <= 32) {
+			if (i->op == Oshr && 32-v <= wbits)
+				return 1;
+			if (0 <= v && v < 32 && wbits < 32)
+				return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
 		}
-		u = fn->tmp[t].use;
-		u1 = &u[fn->tmp[t].nuse];
-		vgrow(pstk, nstk+(u1-u));
-		stk = *pstk;
-		for (; u<u1; u++)
-			stk[nstk++] = u;
+		return defwidthle(fn, i->arg[0], wbits);
+	}
+	if (iscmp(i->op, &x, &x))
+		return wbits >= 1;
+	if (i->op == Oand)
+		return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
+	if (i->op == Oor || i->op == Oxor)
+		return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
+	if (isext(i->op)) {
+		if (EXTSIGNED[i->op - Oextsb])
+			return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
+		if (EXTW[i->op - Oextsb] <= wbits)
+			return 1;
+		return defwidthle(fn, i->arg[0], wbits);
 	}
-	bsdiff(as, ts);
-	if (!bscount(as))
-		for (t=0; bsiter(ts, &t); t++)
-			cpy[t] = r;
+	return 0;
+}
+
+/* is the ref a boolean - 0, 1 - value? */
+int
+iswu1(Fn *fn, Ref r)
+{
+	return defwidthle(fn, r, 1);
 }
 
-static void
-subst(Ref *pr, Ref *cpy)
+static int
+isnarrowpar(Fn *fn, Ref r)
 {
-	assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
-	*pr = copyof(*pr, cpy);
+	Tmp *t;
+
+	if (rtype(r) != RTmp)
+		return 0;
+	t = &fn->tmp[r.val];
+	if (t->bid != fn->start->id || t->def == 0)
+		return 0;
+	return ispar(t->def->op);
 }
 
-/* requires use and dom, breaks use */
+/* Insert extub/extuh instructions in start for pars used only narrowly */
+/* needs use; breaks use */
 void
-copy(Fn *fn)
+narrowpars(Fn *fn)
 {
-	BSet ts[1], as[1];
-	Use **stk;
-	Phi *p, **pp;
-	Ins *i;
 	Blk *b;
-	uint n, a, eq;
-	Ref *cpy, r, r1;
-	int t;
-
-	bsinit(ts, fn->ntmp);
-	bsinit(as, fn->ntmp);
-	cpy = emalloc(fn->ntmp * sizeof cpy[0]);
-	stk = vnew(10, sizeof stk[0], PHeap);
-
-	/* 1. build the copy-of map */
-	for (n=0; n<fn->nblk; n++) {
-		b = fn->rpo[n];
-		for (p=b->phi; p; p=p->link) {
-			assert(rtype(p->to) == RTmp);
-			if (!req(cpy[p->to.val], R))
-				continue;
-			eq = 0;
-			r = R;
-			for (a=0; a<p->narg; a++)
-				if (p->blk[a]->id < n) {
-					r1 = copyof(p->arg[a], cpy);
-					if (req(r, R) || req(r, UNDEF))
-						r = r1;
-					if (req(r1, r) || req(r1, UNDEF))
-						eq++;
-				}
-			assert(!req(r, R));
-			if (rtype(r) == RTmp
-			&& !dom(fn->rpo[fn->tmp[r.val].bid], b))
-				cpy[p->to.val] = p->to;
-			else if (eq == p->narg)
-				cpy[p->to.val] = r;
-			else {
-				cpy[p->to.val] = p->to;
-				phisimpl(p, r, cpy, &stk, ts, as, fn);
-			}
-		}
-		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			assert(rtype(i->to) <= RTmp);
-			if (!req(cpy[i->to.val], R))
-				continue;
-			r = copyof(i->arg[0], cpy);
-			if (iscopy(i, r, fn))
-				cpy[i->to.val] = r;
-			else
-				cpy[i->to.val] = i->to;
-		}
-	}
+	int loop;
+	Ins *i, *ins;
+	uint npar, nins;
+	enum O extop;
+	Ref r;
 
-	/* 2. remove redundant phis/copies
-	 * and rewrite their uses */
-	for (b=fn->start; b; b=b->link) {
-		for (pp=&b->phi; (p=*pp);) {
-			r = cpy[p->to.val];
-			if (!req(r, p->to)) {
-				*pp = p->link;
-				continue;
-			}
-			for (a=0; a<p->narg; a++)
-				subst(&p->arg[a], cpy);
-			pp=&p->link;
-		}
-		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			r = cpy[i->to.val];
-			if (!req(r, i->to)) {
-				*i = (Ins){.op = Onop};
-				continue;
-			}
-			subst(&i->arg[0], cpy);
-			subst(&i->arg[1], cpy);
+	/* only useful for functions with loops */
+	loop = 0;
+	for (b = fn->start; b; b = b->link)
+		if (b->loop > 1) {
+			loop = 1;
+			break;
 		}
-		subst(&b->jmp.arg, cpy);
+	if (!loop)
+		return;
+
+	b = fn->start;
+	npar = 0;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+		if (!ispar(i->op))
+			break;
+		npar++;
 	}
 
-	if (debug['C']) {
-		fprintf(stderr, "\n> Copy information:");
-		for (t=Tmp0; t<fn->ntmp; t++) {
-			if (req(cpy[t], R)) {
-				fprintf(stderr, "\n%10s not seen!",
-					fn->tmp[t].name);
-			}
-			else if (!req(cpy[t], TMP(t))) {
-				fprintf(stderr, "\n%10s copy of ",
-					fn->tmp[t].name);
-				printref(cpy[t], fn, stderr);
+	if (npar == 0)
+		return;
+
+	nins = b->nins + npar;
+	ins = vnew(nins, sizeof ins[0], PFn); //alloc(nins * sizeof ins[0]);
+	memcpy(ins, b->ins, npar * sizeof ins[0]);
+	memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
+	b->ins = ins;
+	b->nins = nins;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+		if (!ispar(i->op))
+			break;
+		extop = Onop;
+		if (i->cls == Kw)
+		if (usewidthle(fn, i->to, 16)) {
+			if (usewidthle(fn, i->to, 8))
+				extop = Oextub;
+			else
+				extop = Oextuh;
 			}
+		if (extop == Onop) {
+			*(i+npar) = (Ins) {.op = Onop};
+		} else {
+			r = newtmp("vw", i->cls, fn);
+			*(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
+			i->to = r;
 		}
-		fprintf(stderr, "\n\n> After copy elimination:\n");
-		printfn(fn, stderr);
 	}
-	vfree(stk);
-	free(cpy);
+}
+
+/* used by GVN */
+Ref
+copyref(Fn *fn, Blk *b, Ins *i)
+{
+	static bits extcpy[] = {
+		[WFull] = 0,
+		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
+		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
+		[Wsh] = BIT(Wsh) | BIT(Wsw),
+		[Wuh] = BIT(Wuh) | BIT(Wuw),
+		[Wsw] = BIT(Wsw),
+		[Wuw] = BIT(Wuw),
+	};
+	bits bext;
+	Tmp *t;
+	int64_t v;
+	int is0;
+
+	if (i->op == Ocopy)
+		return i->arg[0];
+
+	/* op identity value */
+	if (optab[i->op].hasid)
+	if (KBASE(i->cls) == 0) /* integer only - fp NaN! */
+	if (req(i->arg[1], con01[optab[i->op].idval]))
+	if (!optab[i->op].cmpeqwl || iswu1(fn, i->arg[0]))
+		return i->arg[0];
+
+	/* idempotent op with identical args */
+	if (optab[i->op].idemp)
+	if (req(i->arg[0], i->arg[1]))
+		return i->arg[0];
+
+	/* integer cmp with identical args */
+	if (optab[i->op].cmpeqwl || optab[i->op].cmplgtewl)
+	if (req(i->arg[0], i->arg[1]))
+		return con01[optab[i->op].eqval];
+
+	/* cmpeq/ne 0 with 0/non-0 inference from dominating jnz */
+	if (optab[i->op].cmpeqwl)
+	if (req(i->arg[1], con01[0]))
+	if (is0non0(fn, b, i->arg[0], argcls(i,0), &is0))
+		return con01[optab[i->op].eqval^is0^1];
+
+	/* redundant and mask */
+	if (i->op == Oand)
+	if (isconbits(fn, i->arg[1], &v))
+	if (((v+1) & v) == 0) /* v == 2^N-1 */
+	if (defwidthle(fn, i->arg[0], u64_wbits(v)))
+		return i->arg[0];
+
+	if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
+		return R;
+	if (i->op == Oextsw || i->op == Oextuw)
+	if (i->cls == Kw)
+		return i->arg[0];
+
+	t = &fn->tmp[i->arg[0].val];
+	assert(KBASE(t->cls) == 0);
+	if (i->cls == Kl && t->cls == Kw)
+		return R;
+	bext = extcpy[t->width];
+	if ((BIT(Wsb + (i->op-Oextsb)) & bext) != 0)
+		return i->arg[0];
+
+	if (!isnarrowpar(fn, i->arg[0]))
+	if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
+		return i->arg[0];
+	if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
+		return i->arg[0];
+
+	return R;
 }
```
