path: root/copy.c
author    Roland Paterson-Jones <[email protected]>  2024-06-19 16:48:11 +0200
committer Quentin Carbonneaux <[email protected]>  2025-03-14 09:58:37 +0100
commit    c2ff93e75e5f6df8e1679120b18f0d5884deab2b (patch)
tree      b0f728874af29ea01e29f0575b4e5d8a3228a252 /copy.c
parent    9e36cbe4d8f8c9ff3d739ff9ead2a4a4988c0904 (diff)
Global Value Numbering / Global Code Motion
More or less as proposed in its ninth iteration, with the addition of
gcmmove() functionality to restore coherent local schedules.

Changes since RFC 8:

Features:
- generalization of phi 1/0 detection
- collapse linear jmp chains before GVN; simplifies the if-graph
  detection used in 0/non-0 value inference and if-elim...
- infer 0/non-0 values from dominating blk jnz; eliminates redundant
  cmp eq/ne 0 and associated jnz/blocks, for example redundant null
  pointer checks (the hare codebase likes this) - see the sketch
  after this message
- remove (emergent) empty if-then-else graphlets between GVN and GCM;
  improves GCM instruction placement, particularly cmps
- merge %addr =l add %addr1, N sequences - reduces tmp count and
  register pressure
- squash consecutive associative ops with constant args, e.g.
  t1 = add t, N ... t2 = add t1, M -> t2 = add t, N+M

Bug Fixes:
- remove "cmp eq/ne of non-identical RCon's" in copyref(); RCon's are
  not guaranteed to be dedup'ed, and symbols can alias

Codebase:
- moved some stuff into cfg.c, including blkmerge()
- some refactoring in gvn.c
- simplification of reassoc.c - always reassoc all cmp ops and
  Kl add %t, N; better on coremark, smaller codebase
- minor simplification of movins() - use vins

Testing:
- standard QBE, cproc, hare, harec, coremark [still have Rust build
  issues with latest roland]

Benchmark:
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" is ~8% faster
  (crypto::sha1::sha1_1gb is the biggest obvious win - ~25% faster)

Changes since RFC 7:

Bug fixes:
- remove isbad4gcm() in GVN/GCM - it is unsound due to different
  state at GVN vs GCM time; replaced with a "reassociation" pass
  after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions, because GCM will
  not move them
- remove cmp eq/ne identical-arg copy detection for floating point;
  it is not valid for NaN
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
  respectively; just a typo

Minor features:
- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by a non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use counts in GVN; not strictly necessary,
  but enables interim global-state sanity checking
- "reassociation" of trivial constant-offset load/store addresses,
  and of cmp ops with their point of use, in a pass after GCM
- normalise commutative op arg order, e.g. op con, tmp ->
  op tmp, con, to simplify copy detection and GVN instruction dedup

Codebase:
- split core copy detection and constant folding back out into
  copy.c and fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
  reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactoring

Changes since RFC 6:
- more ext elimination in GVN by examination of def and use bit width
- elimination of redundant and masks by bit-width examination
- incorporation of Song's patch

Changes since RFC 5:
- avoidance of "bad" candidates for GVN/GCM - trivial address offset
  calculations and comparisons
- more copy detection, mostly around boolean values
- allow elimination of unused load, alloc, and trapping instructions
- detection of trivial boolean v ? 1 : 0 phi patterns
- bug fix for (removal of) the "chg" optimisation in ins recreation -
  it was missing removal of unused instructions in some cases
- ifelim() between GVN and GCM; deeper nopunused()
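To illustrate the 0/non-0 inference on a hypothetical C fragment (an
illustration, not code from this patch): once a block is dominated by
a jnz on a value, a later cmp eq/ne 0 of that value folds to a
constant, and its jnz and dead block are then eliminated.

	/* sketch: the inner null check is redundant under the outer one */
	int
	first_byte(unsigned char *p)
	{
		if (p) {        /* compiles to a dominating jnz on p */
			if (p)  /* p known non-0 here: cmp ne 0 folds to 1 */
				return p[0];
		}
		return -1;
	}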
Diffstat (limited to 'copy.c')
-rw-r--r--  copy.c  475
1 file changed, 295 insertions(+), 180 deletions(-)
diff --git a/copy.c b/copy.c
index da3987f..95c7e00 100644
--- a/copy.c
+++ b/copy.c
@@ -1,217 +1,332 @@
#include "all.h"
-static int
-iscon(Ref r, int64_t bits, Fn *fn)
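+/* number of significant bits in v: the smallest n with v < 2^n */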
+static uint
+u64_wbits(uint64_t v)
{
- return rtype(r) == RCon
- && fn->con[r.val].type == CBits
- && fn->con[r.val].bits.i == bits;
+ uint n;
+
+ n = 0;
+ if (v >> 32) { n += 32; v >>= 32; }
+ if (v >> 16) { n += 16; v >>= 16; }
+ if (v >> 8) { n += 8; v >>= 8; }
+ if (v >> 4) { n += 4; v >>= 4; }
+ if (v >> 2) { n += 2; v >>= 2; }
+ if (v >> 1) { n += 1; v >>= 1; }
+ return n+v;
}
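+/* per-ext-op tables, indexed by op - Oextsb:
+ * EXTSIGNED: non-zero if the op sign-extends
+ * EXTMAXW: widest def, in bits, that the op passes through
+ *          unchanged (one less than the ext width for signed
+ *          ops, since the sign bit must be clear)
+ * EXTW: number of low bits the op reads
+ * STW (indexed by op - Ostoreb): bits written by a store */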
static int
-iscopy(Ins *i, Ref r, Fn *fn)
+EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };
+
+static uint
+EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };
+
+static uint
+EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };
+
+static uint
+STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };
+
+/* is the ref used only as a narrow value? */
+static int
+usewidthle(Fn *fn, Ref r, uint wbits)
{
- static bits extcpy[] = {
- [WFull] = 0,
- [Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
- [Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
- [Wsh] = BIT(Wsh) | BIT(Wsw),
- [Wuh] = BIT(Wuh) | BIT(Wuw),
- [Wsw] = BIT(Wsw),
- [Wuw] = BIT(Wuw),
- };
- Op *op;
- bits b;
Tmp *t;
+ Use *u;
+ Phi *p;
+ int b;
+ Ins *i;
+ Ref rc;
+ int64_t v;
- if (i->op == Ocopy)
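+ /* a constant that fits in wbits is narrow in every use */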
+ if (isconbits(fn, r, &v))
+ if (u64_wbits(v) <= wbits)
return 1;
- op = &optab[i->op];
- if (op->hasid && KBASE(i->cls) == 0)
- return iscon(i->arg[1], op->idval, fn);
- if (!isext(i->op) || rtype(r) != RTmp)
+ if (rtype(r) != RTmp)
return 0;
- if (i->op == Oextsw || i->op == Oextuw)
- if (i->cls == Kw)
- return 1;
-
t = &fn->tmp[r.val];
- assert(KBASE(t->cls) == 0);
- if (i->cls == Kl && t->cls == Kw)
+ for (u = t->use; u < &t->use[t->nuse]; u++) {
+ switch (u->type) {
+ case UPhi:
+ p = u->u.phi;
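+ /* the visit flag breaks recursion through phi cycles */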
+ if (p->visit)
+ continue;
+ p->visit = 1;
+ b = usewidthle(fn, p->to, wbits);
+ p->visit = 0;
+ if (b)
+ continue;
+ break;
+ case UIns:
+ i = u->u.ins;
+ assert(i != 0);
+ if (i->op == Ocopy)
+ if (usewidthle(fn, i->to, wbits))
+ continue;
+ if (isext(i->op)) {
+ if (EXTW[i->op - Oextsb] <= wbits)
+ continue;
+ else
+ if (usewidthle(fn, i->to, wbits))
+ continue;
+ }
+ if (i->op == Oand) {
+ if (req(r, i->arg[0]))
+ rc = i->arg[1];
+ else {
+ assert(req(r, i->arg[1]));
+ rc = i->arg[0];
+ }
+ if (isconbits(fn, rc, &v))
+ if (u64_wbits(v) <= wbits)
+ continue;
+ break;
+ }
+ if (isstore(i->op))
+ if (req(r, i->arg[1]))
+ if (STW[i->op - Ostoreb] > wbits)
+ continue;
+ break;
+ default:
+ break;
+ }
return 0;
- b = extcpy[t->width];
- return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
+ }
+ return 1;
}
-static Ref
-copyof(Ref r, Ref *cpy)
+static Phi*
+findphi(Fn *fn, uint bid, Ref to)
{
- if (rtype(r) == RTmp && !req(cpy[r.val], R))
- return cpy[r.val];
- return r;
+ Phi *p;
+ for (p = fn->rpo[bid]->phi; p; p = p->link)
+ if (req(p->to, to))
+ break;
+ assert(p);
+ return p;
}
-/* detects a cluster of phis/copies redundant with 'r';
- * the algorithm is inspired by Section 3.2 of "Simple
- * and Efficient SSA Construction" by Braun M. et al.
- */
-static void
-phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn)
+static uint
+uint_min(uint v1, uint v2)
{
- Use **stk, *u, *u1;
- uint nstk, a;
- int t;
- Ref r1;
- Phi *p0;
-
- bszero(ts);
- bszero(as);
- p0 = &(Phi){.narg = 0};
- stk = *pstk;
- nstk = 1;
- stk[0] = &(Use){.type = UPhi, .u.phi = p};
- while (nstk) {
- u = stk[--nstk];
- if (u->type == UIns && iscopy(u->u.ins, r, fn)) {
- p = p0;
- t = u->u.ins->to.val;
- }
- else if (u->type == UPhi) {
- p = u->u.phi;
- t = p->to.val;
- }
- else
- continue;
- if (bshas(ts, t))
- continue;
- bsset(ts, t);
- for (a=0; a<p->narg; a++) {
- r1 = copyof(p->arg[a], cpy);
- if (req(r1, r))
- continue;
- if (rtype(r1) != RTmp)
- return;
- bsset(as, r1.val);
+ return v1 < v2 ? v1 : v2;
+}
+
+/* is the ref def a narrow value? */
+static int
+defwidthle(Fn *fn, Ref r, uint wbits)
+{
+ Tmp *t;
+ Phi *p;
+ Ins *i;
+ uint n;
+ int64_t v;
+ int x;
+
+ if (isconbits(fn, r, &v))
+ if (u64_wbits(v) <= wbits)
+ return 1;
+ if (rtype(r) != RTmp)
+ return 0;
+ t = &fn->tmp[r.val];
+ if (t->cls != Kw)
+ return 0;
+ i = t->def;
+ if (i == 0) {
+ /* phi def */
+ p = findphi(fn, t->bid, r);
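+ /* optimistic on phi cycles: a revisited phi is assumed narrow,
+  * so the web is narrow iff all its external inputs are narrow */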
+ if (p->visit)
+ return 1;
+ p->visit = 1;
+ for (n = 0; n < p->narg; n++)
+ if (!defwidthle(fn, p->arg[n], wbits)) {
+ p->visit = 0;
+ return 0;
+ }
+ p->visit = 0;
+ return 1;
+ }
+ /* ins def */
+ if (i->op == Ocopy)
+ return defwidthle(fn, i->arg[0], wbits);
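+ /* a constant right shift narrows: the result fits in wbits if
+  * only wbits bits remain, or if the source fits in wbits+v
+  * (capped at the word width; 31 for sar so the sign bit is 0) */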
+ if (i->op == Oshr || i->op == Osar) {
+ if (isconbits(fn, i->arg[1], &v))
+ if (0 < v && v <= 32) {
+ if (i->op == Oshr && 32-v <= wbits)
+ return 1;
+ if (0 <= v && v < 32 && wbits < 32)
+ return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
}
- u = fn->tmp[t].use;
- u1 = &u[fn->tmp[t].nuse];
- vgrow(pstk, nstk+(u1-u));
- stk = *pstk;
- for (; u<u1; u++)
- stk[nstk++] = u;
+ return defwidthle(fn, i->arg[0], wbits);
+ }
+ if (iscmp(i->op, &x, &x))
+ return wbits >= 1;
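+ /* and: one narrow arg bounds the result; or/xor need both */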
+ if (i->op == Oand)
+ return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
+ if (i->op == Oor || i->op == Oxor)
+ return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
+ if (isext(i->op)) {
+ if (EXTSIGNED[i->op - Oextsb])
+ return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
+ if (EXTW[i->op - Oextsb] <= wbits)
+ return 1;
+ return defwidthle(fn, i->arg[0], wbits);
}
- bsdiff(as, ts);
- if (!bscount(as))
- for (t=0; bsiter(ts, &t); t++)
- cpy[t] = r;
+ return 0;
+}
+
+/* is the ref a boolean - 0, 1 - value? */
+int
+iswu1(Fn *fn, Ref r)
+{
+ return defwidthle(fn, r, 1);
}
-static void
-subst(Ref *pr, Ref *cpy)
+static int
+isnarrowpar(Fn *fn, Ref r)
{
- assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
- *pr = copyof(*pr, cpy);
+ Tmp *t;
+
+ if (rtype(r) != RTmp)
+ return 0;
+ t = &fn->tmp[r.val];
+ if (t->bid != fn->start->id || t->def == 0)
+ return 0;
+ return ispar(t->def->op);
}
-/* requires use and dom, breaks use */
+/* Insert extub/extuh instructions in start for pars used only narrowly */
+/* needs use; breaks use */
void
-copy(Fn *fn)
+narrowpars(Fn *fn)
{
- BSet ts[1], as[1];
- Use **stk;
- Phi *p, **pp;
- Ins *i;
Blk *b;
- uint n, a, eq;
- Ref *cpy, r, r1;
- int t;
-
- bsinit(ts, fn->ntmp);
- bsinit(as, fn->ntmp);
- cpy = emalloc(fn->ntmp * sizeof cpy[0]);
- stk = vnew(10, sizeof stk[0], PHeap);
-
- /* 1. build the copy-of map */
- for (n=0; n<fn->nblk; n++) {
- b = fn->rpo[n];
- for (p=b->phi; p; p=p->link) {
- assert(rtype(p->to) == RTmp);
- if (!req(cpy[p->to.val], R))
- continue;
- eq = 0;
- r = R;
- for (a=0; a<p->narg; a++)
- if (p->blk[a]->id < n) {
- r1 = copyof(p->arg[a], cpy);
- if (req(r, R) || req(r, UNDEF))
- r = r1;
- if (req(r1, r) || req(r1, UNDEF))
- eq++;
- }
- assert(!req(r, R));
- if (rtype(r) == RTmp
- && !dom(fn->rpo[fn->tmp[r.val].bid], b))
- cpy[p->to.val] = p->to;
- else if (eq == p->narg)
- cpy[p->to.val] = r;
- else {
- cpy[p->to.val] = p->to;
- phisimpl(p, r, cpy, &stk, ts, as, fn);
- }
- }
- for (i=b->ins; i<&b->ins[b->nins]; i++) {
- assert(rtype(i->to) <= RTmp);
- if (!req(cpy[i->to.val], R))
- continue;
- r = copyof(i->arg[0], cpy);
- if (iscopy(i, r, fn))
- cpy[i->to.val] = r;
- else
- cpy[i->to.val] = i->to;
- }
- }
+ int loop;
+ Ins *i, *ins;
+ uint npar, nins;
+ enum O extop;
+ Ref r;
- /* 2. remove redundant phis/copies
- * and rewrite their uses */
- for (b=fn->start; b; b=b->link) {
- for (pp=&b->phi; (p=*pp);) {
- r = cpy[p->to.val];
- if (!req(r, p->to)) {
- *pp = p->link;
- continue;
- }
- for (a=0; a<p->narg; a++)
- subst(&p->arg[a], cpy);
- pp=&p->link;
- }
- for (i=b->ins; i<&b->ins[b->nins]; i++) {
- r = cpy[i->to.val];
- if (!req(r, i->to)) {
- *i = (Ins){.op = Onop};
- continue;
- }
- subst(&i->arg[0], cpy);
- subst(&i->arg[1], cpy);
+ /* only useful for functions with loops */
+ loop = 0;
+ for (b = fn->start; b; b = b->link)
+ if (b->loop > 1) {
+ loop = 1;
+ break;
}
- subst(&b->jmp.arg, cpy);
+ if (!loop)
+ return;
+
+ b = fn->start;
+ npar = 0;
+
+ for (i = b->ins; i < &b->ins[b->nins]; i++) {
+ if (!ispar(i->op))
+ break;
+ npar++;
}
- if (debug['C']) {
- fprintf(stderr, "\n> Copy information:");
- for (t=Tmp0; t<fn->ntmp; t++) {
- if (req(cpy[t], R)) {
- fprintf(stderr, "\n%10s not seen!",
- fn->tmp[t].name);
- }
- else if (!req(cpy[t], TMP(t))) {
- fprintf(stderr, "\n%10s copy of ",
- fn->tmp[t].name);
- printref(cpy[t], fn, stderr);
+ if (npar == 0)
+ return;
+
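+ /* new layout: [pars][npar slots for ext/nop][remaining ins] */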
+ nins = b->nins + npar;
+ ins = vnew(nins, sizeof ins[0], PFn);
+ memcpy(ins, b->ins, npar * sizeof ins[0]);
+ memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
+ b->ins = ins;
+ b->nins = nins;
+
+ for (i = b->ins; i < &b->ins[b->nins]; i++) {
+ if (!ispar(i->op))
+ break;
+ extop = Onop;
+ if (i->cls == Kw)
+ if (usewidthle(fn, i->to, 16)) {
+ if (usewidthle(fn, i->to, 8))
+ extop = Oextub;
+ else
+ extop = Oextuh;
}
+ if (extop == Onop) {
+ *(i+npar) = (Ins) {.op = Onop};
+ } else {
+ r = newtmp("vw", i->cls, fn);
+ *(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
+ i->to = r;
}
- fprintf(stderr, "\n\n> After copy elimination:\n");
- printfn(fn, stderr);
}
- vfree(stk);
- free(cpy);
+}
+
+/* used by GVN */
+Ref
+copyref(Fn *fn, Blk *b, Ins *i)
+{
+ static bits extcpy[] = {
+ [WFull] = 0,
+ [Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
+ [Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
+ [Wsh] = BIT(Wsh) | BIT(Wsw),
+ [Wuh] = BIT(Wuh) | BIT(Wuw),
+ [Wsw] = BIT(Wsw),
+ [Wuw] = BIT(Wuw),
+ };
+ bits bext;
+ Tmp *t;
+ int64_t v;
+ int is0;
+
+ if (i->op == Ocopy)
+ return i->arg[0];
+
+ /* op identity value */
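+ /* for cmp ops (cmpeqwl), the identity (e.g. x != 0 -> x) only
+  * holds when x is known to be a boolean 0/1 value */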
+ if (optab[i->op].hasid)
+ if (KBASE(i->cls) == 0) /* integer only - fp NaN! */
+ if (req(i->arg[1], con01[optab[i->op].idval]))
+ if (!optab[i->op].cmpeqwl || iswu1(fn, i->arg[0]))
+ return i->arg[0];
+
+ /* idempotent op with identical args */
+ if (optab[i->op].idemp)
+ if (req(i->arg[0], i->arg[1]))
+ return i->arg[0];
+
+ /* integer cmp with identical args */
+ if (optab[i->op].cmpeqwl || optab[i->op].cmplgtewl)
+ if (req(i->arg[0], i->arg[1]))
+ return con01[optab[i->op].eqval];
+
+ /* cmpeq/ne 0 with 0/non-0 inference from dominating jnz */
+ if (optab[i->op].cmpeqwl)
+ if (req(i->arg[1], con01[0]))
+ if (is0non0(fn, b, i->arg[0], argcls(i,0), &is0))
+ return con01[optab[i->op].eqval^is0^1];
+
+ /* redundant and mask */
+ if (i->op == Oand)
+ if (isconbits(fn, i->arg[1], &v))
+ if (((v+1) & v) == 0) /* v == 2^N-1 */
+ if (defwidthle(fn, i->arg[0], u64_wbits(v)))
+ return i->arg[0];
+
+ if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
+ return R;
+ if (i->op == Oextsw || i->op == Oextuw)
+ if (i->cls == Kw)
+ return i->arg[0];
+
+ t = &fn->tmp[i->arg[0].val];
+ assert(KBASE(t->cls) == 0);
+ if (i->cls == Kl && t->cls == Kw)
+ return R;
+ bext = extcpy[t->width];
+ if ((BIT(Wsb + (i->op-Oextsb)) & bext) != 0)
+ return i->arg[0];
+
+ if (!isnarrowpar(fn, i->arg[0]))
+ if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
+ return i->arg[0];
+ if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
+ return i->arg[0];
+
+ return R;
}
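For context, a rough sketch of how a GVN caller might consume the
return convention above (the surrounding loop and the copyof[] map
are hypothetical, not part of this patch): copyref() returns R when
the instruction is not a copy, and the replacement ref otherwise.

	r = copyref(fn, b, i);
	if (!req(r, R)) {
		/* i->to is a copy of r: record it and drop i */
		copyof[i->to.val] = r;
		*i = (Ins){.op = Onop};
	}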