| author | Roland Paterson-Jones <[email protected]> | 2024-06-19 16:48:11 +0200 |
|---|---|---|
| committer | Quentin Carbonneaux <[email protected]> | 2025-03-14 09:58:37 +0100 |
| commit | c2ff93e75e5f6df8e1679120b18f0d5884deab2b | |
| tree | b0f728874af29ea01e29f0575b4e5d8a3228a252 /copy.c | |
| parent | 9e36cbe4d8f8c9ff3d739ff9ead2a4a4988c0904 | |
Global Value Numbering / Global Code Motion
More or less as proposed in its ninth iteration, with the
addition of a gcmmove() function to restore coherent
local schedules.
Changes since RFC 8:
Features:
- generalization of phi 1/0 detection
- collapse linear jmp chains before GVN; this simplifies the if-graph
  detection used in 0/non-0 value inference and if-elimination
- infer 0/non-0 values from dominating blk jnz; eliminates
redundant cmp eq/ne 0 and associated jnz/blocks, for example
redundant null pointer checks (hare codebase likes this)
- remove (emergent) empty if-then-else graphlets between GVN and
GCM; improves GCM instruction placement, particularly cmps.
- merge %addr =l add %addr1, N sequences; reduces tmp count and
  register pressure
- squash consecutive associative ops with constant args, e.g.
  t1 = add t, N ... t2 = add t1, M -> t2 = add t, N+M
  (see the sketch after this list)
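To make the squash rule concrete, here is a minimal sketch on a simplified, hypothetical instruction model; it shows the shape of the rewrite, not QBE's actual Ins layout or its GVN code:

```c
#include <stdint.h>

/* hypothetical, simplified instruction: "to = op arg, con" */
typedef struct {
	int op;       /* operation, e.g. Oadd */
	int to;       /* result tmp */
	int arg;      /* tmp argument */
	uint64_t con; /* constant argument (two's-complement wrap) */
} SIns;

enum { Oadd = 1 };

/* given def "t1 = add t, N" and use "t2 = add t1, M", rewrite
 * the use as "t2 = add t, N+M"; t1 may then become dead */
static int
squashadd(SIns *def, SIns *use)
{
	if (def->op != Oadd || use->op != Oadd)
		return 0;
	if (use->arg != def->to)
		return 0;
	use->arg = def->arg;
	use->con += def->con;
	return 1;
}
```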
Bug Fixes:
- remove "cmp eq/ne of non-identical RCon's " in copyref().
RCon's are not guaranteed to be dedup'ed, and symbols can
alias.
Codebase:
- moved some code into cfg.c, including blkmerge()
- some refactoring in gvn.c
- simplification of reassoc.c - always reassociate all cmp ops
  and Kl add %t, N; better on coremark, smaller codebase
- minor simplification of movins() - use vins
Testing - standard QBE, cproc, hare, harec, coremark
[still have Rust build issues with latest roland]
Benchmarks:
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" ~8% faster
  (crypto::sha1::sha1_1gb is the biggest obvious win - ~25% faster)
Changes since RFC 7:
Bug fixes:
- remove isbad4gcm() in GVN/GCM - it is unsound due to different state
  at GVN vs GCM time; replaced with a "reassociation" pass after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions, because GCM will
  not move them
- remove cmp eq/ne identical-arg copy detection for floating point;
  it is not valid for NaN (see the C demonstration after this list)
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
respectively; just a typo
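The NaN fix follows from IEEE 754: every ordered comparison involving NaN is false, so a floating-point cmp eq of a value with itself is not constant 1 and must not be treated as a copy. A standalone C check:

```c
#include <math.h>
#include <stdio.h>

int
main(void)
{
	double x = NAN;

	/* prints 0 then 1: x == x is false for NaN, so folding
	 * an eq compare of identical fp args to 1 would be wrong */
	printf("x == x: %d\n", x == x);
	printf("x != x: %d\n", x != x);
	return 0;
}
```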
Minor features:
- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use counts in GVN; not strictly necessary but
  enables interim global-state sanity checking
- "reassociation" of trivial constant offset load/store addresses, and
cmp ops with point-of-use in pass after GCM
- normalise commutative op arg order, e.g. op con, tmp -> op tmp, con,
  to simplify copy detection and GVN instruction dedup (see the sketch
  after this list)
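Arg-order normalisation is plain canonicalisation: with the constant always second, structurally equal instructions compare and hash equal in GVN's dedup table. A sketch under an assumed ref encoding (negative values stand for constants), not QBE's Ref type:

```c
/* hypothetical, simplified instruction with two refs */
typedef struct {
	int op;
	int arg[2];
} NIns;

static int
isconref(int ref)
{
	return ref < 0;
}

/* canonicalise "op con, tmp" into "op tmp, con" for a
 * commutative op so GVN sees a single form */
static void
normalise(NIns *i, int commutes)
{
	int t;

	if (!commutes)
		return;
	if (isconref(i->arg[0]) && !isconref(i->arg[1])) {
		t = i->arg[0];
		i->arg[0] = i->arg[1];
		i->arg[1] = t;
	}
}
```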
Codebase:
- split core copy detection and constant folding back out into
  copy.c and fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactor
Changes since RFC 6:
- more ext elimination in GVN by examining def and use bit widths
- elimination of redundant and masks by bit-width examination (see the
  sketch after this list)
- incorporation of Song's patch
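The and-mask rule reappears in the copy.c diff below: copyref() treats and %x, m as a copy of %x when m == 2^N-1 and the definition of %x is known to fit in N bits. A self-contained C sketch of that predicate, with defwidth standing in for the defwidthle() analysis:

```c
#include <stdint.h>

/* bits needed to represent v (0 for v == 0); same result as
 * u64_wbits() in the diff below */
static unsigned
wbits(uint64_t v)
{
	unsigned n;

	for (n = 0; v; v >>= 1)
		n++;
	return n;
}

/* "and %x, m" is redundant when m == 2^N - 1 and %x already
 * fits in N bits; defwidth is a stand-in for defwidthle() */
static int
redundantmask(uint64_t m, unsigned defwidth)
{
	if (((m + 1) & m) != 0) /* require m == 2^N - 1 */
		return 0;
	return defwidth <= wbits(m);
}
```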
Changes since RFC 5:
- avoidance of "bad" candidates for GVN/GCM - trivial address offset
calculations, and comparisons
- more copy detection mostly around boolean values
- allow elimination of unused load, alloc, trapping instructions
- detection of trivial boolean v ? 1 : 0 phi patterns
- bug fix for (removal of) "chg" optimisation in ins recreation - it
  was missing removal of unused instructions in some cases
- ifelim() between GVN and GCM; deeper nopunused()
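nopunused() is use-count-driven dead-instruction removal; a minimal sketch of the idea under an assumed, simplified data model (hypothetical names, not QBE's API):

```c
enum { Xnop, Xadd, Xload, Xdiv };

/* hypothetical instruction: result tmp index, or -1 for none */
typedef struct {
	int op;
	int to;
} DIns;

/* ops safe to drop when their result is unused; per the notes
 * above this can include loads, allocs and divisions proven
 * non-trapping */
static int
candrop(int op)
{
	return op == Xadd || op == Xload || op == Xdiv;
}

/* overwrite with nops all droppable instructions whose result
 * has a zero use count */
static void
nopunusedins(DIns *ins, int nins, const int *nuse)
{
	int n;

	for (n = 0; n < nins; n++)
		if (ins[n].to >= 0
		&& candrop(ins[n].op)
		&& nuse[ins[n].to] == 0)
			ins[n].op = Xnop;
}
```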
Diffstat (limited to 'copy.c')
| -rw-r--r-- | copy.c | 475 |
1 file changed, 295 insertions, 180 deletions
```diff
@@ -1,217 +1,332 @@
 #include "all.h"
 
-static int
-iscon(Ref r, int64_t bits, Fn *fn)
+static uint
+u64_wbits(uint64_t v)
 {
-	return rtype(r) == RCon
-		&& fn->con[r.val].type == CBits
-		&& fn->con[r.val].bits.i == bits;
+	uint n;
+
+	n = 0;
+	if (v >> 32) { n += 32; v >>= 32; }
+	if (v >> 16) { n += 16; v >>= 16; }
+	if (v >> 8) { n += 8; v >>= 8; }
+	if (v >> 4) { n += 4; v >>= 4; }
+	if (v >> 2) { n += 2; v >>= 2; }
+	if (v >> 1) { n += 1; v >>= 1; }
+	return n+v;
 }
 
 static int
-iscopy(Ins *i, Ref r, Fn *fn)
+EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };
+
+static uint
+EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };
+
+static uint
+EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };
+
+static uint
+STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };
+
+/* is the ref used only as a narrow value? */
+static int
+usewidthle(Fn *fn, Ref r, uint wbits)
 {
-	static bits extcpy[] = {
-		[WFull] = 0,
-		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
-		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
-		[Wsh] = BIT(Wsh) | BIT(Wsw),
-		[Wuh] = BIT(Wuh) | BIT(Wuw),
-		[Wsw] = BIT(Wsw),
-		[Wuw] = BIT(Wuw),
-	};
-	Op *op;
-	bits b;
 	Tmp *t;
+	Use *u;
+	Phi *p;
+	int b;
+	Ins *i;
+	Ref rc;
+	int64_t v;
 
-	if (i->op == Ocopy)
+	if (isconbits(fn, r, &v))
+	if (u64_wbits(v) <= wbits)
 		return 1;
-	op = &optab[i->op];
-	if (op->hasid && KBASE(i->cls) == 0)
-		return iscon(i->arg[1], op->idval, fn);
-	if (!isext(i->op) || rtype(r) != RTmp)
+	if (rtype(r) != RTmp)
 		return 0;
-	if (i->op == Oextsw || i->op == Oextuw)
-	if (i->cls == Kw)
-		return 1;
-
 	t = &fn->tmp[r.val];
-	assert(KBASE(t->cls) == 0);
-	if (i->cls == Kl && t->cls == Kw)
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		switch (u->type) {
+		case UPhi:
+			p = u->u.phi;
+			if (p->visit)
+				continue;
+			p->visit = 1;
+			b = usewidthle(fn, p->to, wbits);
+			p->visit = 0;
+			if (b)
+				continue;
+			break;
+		case UIns:
+			i = u->u.ins;
+			assert(i != 0);
+			if (i->op == Ocopy)
+			if (usewidthle(fn, i->to, wbits))
+				continue;
+			if (isext(i->op)) {
+				if (EXTW[i->op - Oextsb] <= wbits)
+					continue;
+				else
+				if (usewidthle(fn, i->to, wbits))
+					continue;;
+			}
+			if (i->op == Oand) {
+				if (req(r, i->arg[0]))
+					rc = i->arg[1];
+				else {
+					assert(req(r, i->arg[1]));
+					rc = i->arg[0];
+				}
+				if (isconbits(fn, rc, &v))
+				if (u64_wbits(v) <= wbits)
+					continue;
+				break;
+			}
+			if (isstore(i->op))
+			if (req(r, i->arg[1]))
+			if (STW[i->op - Ostoreb] > wbits)
+				continue;
+			break;
+		default:
+			break;
+		}
 		return 0;
-	b = extcpy[t->width];
-	return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
+	}
+	return 1;
 }
 
-static Ref
-copyof(Ref r, Ref *cpy)
+static Phi*
+findphi(Fn *fn, uint bid, Ref to)
 {
-	if (rtype(r) == RTmp && !req(cpy[r.val], R))
-		return cpy[r.val];
-	return r;
+	Phi *p;
+	for (p = fn->rpo[bid]->phi; p; p = p->link)
+		if (req(p->to, to))
+			break;
+	assert(p);
+	return p;
 }
 
-/* detects a cluster of phis/copies redundant with 'r';
- * the algorithm is inspired by Section 3.2 of "Simple
- * and Efficient SSA Construction" by Braun M. et al.
- */
-static void
-phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn)
+static uint
+uint_min(uint v1, uint v2)
 {
-	Use **stk, *u, *u1;
-	uint nstk, a;
-	int t;
-	Ref r1;
-	Phi *p0;
-
-	bszero(ts);
-	bszero(as);
-	p0 = &(Phi){.narg = 0};
-	stk = *pstk;
-	nstk = 1;
-	stk[0] = &(Use){.type = UPhi, .u.phi = p};
-	while (nstk) {
-		u = stk[--nstk];
-		if (u->type == UIns && iscopy(u->u.ins, r, fn)) {
-			p = p0;
-			t = u->u.ins->to.val;
-		}
-		else if (u->type == UPhi) {
-			p = u->u.phi;
-			t = p->to.val;
-		}
-		else
-			continue;
-		if (bshas(ts, t))
-			continue;
-		bsset(ts, t);
-		for (a=0; a<p->narg; a++) {
-			r1 = copyof(p->arg[a], cpy);
-			if (req(r1, r))
-				continue;
-			if (rtype(r1) != RTmp)
-				return;
-			bsset(as, r1.val);
+	return v1 < v2 ? v1 : v2;
+}
+
+/* is the ref def a narrow value? */
+static int
+defwidthle(Fn *fn, Ref r, uint wbits)
+{
+	Tmp *t;
+	Phi *p;
+	Ins *i;
+	uint n;
+	int64_t v;
+	int x;
+
+	if (isconbits(fn, r, &v))
+	if (u64_wbits(v) <= wbits)
+		return 1;
+	if (rtype(r) != RTmp)
+		return 0;
+	t = &fn->tmp[r.val];
+	if (t->cls != Kw)
+		return 0;
+	i = t->def;
+	if (i == 0) {
+		/* phi def */
+		p = findphi(fn, t->bid, r);
+		if (p->visit)
+			return 1;
+		p->visit = 1;
+		for (n = 0; n < p->narg; n++)
+			if (!defwidthle(fn, p->arg[n], wbits)) {
+				p->visit = 0;
+				return 0;
+			}
+		p->visit = 0;
+		return 1;
+	}
+	/* ins def */
+	if (i->op == Ocopy)
+		return defwidthle(fn, i->arg[0], wbits);
+	if (i->op == Oshr || i->op == Osar) {
+		if (isconbits(fn, i->arg[1], &v))
+		if (0 < v && v <= 32) {
+			if (i->op == Oshr && 32-v <= wbits)
+				return 1;
+			if (0 <= v && v < 32 && wbits < 32)
+				return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
 		}
-		u = fn->tmp[t].use;
-		u1 = &u[fn->tmp[t].nuse];
-		vgrow(pstk, nstk+(u1-u));
-		stk = *pstk;
-		for (; u<u1; u++)
-			stk[nstk++] = u;
+		return defwidthle(fn, i->arg[0], wbits);
+	}
+	if (iscmp(i->op, &x, &x))
+		return wbits >= 1;
+	if (i->op == Oand)
+		return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
+	if (i->op == Oor || i->op == Oxor)
+		return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
+	if (isext(i->op)) {
+		if (EXTSIGNED[i->op - Oextsb])
+			return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
+		if (EXTW[i->op - Oextsb] <= wbits)
+			return 1;
+		return defwidthle(fn, i->arg[0], wbits);
 	}
-	bsdiff(as, ts);
-	if (!bscount(as))
-		for (t=0; bsiter(ts, &t); t++)
-			cpy[t] = r;
+	return 0;
+}
+
+/* is the ref a boolean - 0, 1 - value? */
+int
+iswu1(Fn *fn, Ref r)
+{
+	return defwidthle(fn, r, 1);
 }
 
-static void
-subst(Ref *pr, Ref *cpy)
+static int
+isnarrowpar(Fn *fn, Ref r)
 {
-	assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
-	*pr = copyof(*pr, cpy);
+	Tmp *t;
+
+	if (rtype(r) != RTmp)
+		return 0;
+	t = &fn->tmp[r.val];
+	if (t->bid != fn->start->id || t->def == 0)
+		return 0;
+	return ispar(t->def->op);
 }
 
-/* requires use and dom, breaks use */
+/* Insert extub/extuh instructions in start for pars used only narrowly */
+/* needs use; breaks use */
 void
-copy(Fn *fn)
+narrowpars(Fn *fn)
 {
-	BSet ts[1], as[1];
-	Use **stk;
-	Phi *p, **pp;
-	Ins *i;
 	Blk *b;
-	uint n, a, eq;
-	Ref *cpy, r, r1;
-	int t;
-
-	bsinit(ts, fn->ntmp);
-	bsinit(as, fn->ntmp);
-	cpy = emalloc(fn->ntmp * sizeof cpy[0]);
-	stk = vnew(10, sizeof stk[0], PHeap);
-
-	/* 1. build the copy-of map */
-	for (n=0; n<fn->nblk; n++) {
-		b = fn->rpo[n];
-		for (p=b->phi; p; p=p->link) {
-			assert(rtype(p->to) == RTmp);
-			if (!req(cpy[p->to.val], R))
-				continue;
-			eq = 0;
-			r = R;
-			for (a=0; a<p->narg; a++)
-				if (p->blk[a]->id < n) {
-					r1 = copyof(p->arg[a], cpy);
-					if (req(r, R) || req(r, UNDEF))
-						r = r1;
-					if (req(r1, r) || req(r1, UNDEF))
-						eq++;
-				}
-			assert(!req(r, R));
-			if (rtype(r) == RTmp
-			&& !dom(fn->rpo[fn->tmp[r.val].bid], b))
-				cpy[p->to.val] = p->to;
-			else if (eq == p->narg)
-				cpy[p->to.val] = r;
-			else {
-				cpy[p->to.val] = p->to;
-				phisimpl(p, r, cpy, &stk, ts, as, fn);
-			}
-		}
-		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			assert(rtype(i->to) <= RTmp);
-			if (!req(cpy[i->to.val], R))
-				continue;
-			r = copyof(i->arg[0], cpy);
-			if (iscopy(i, r, fn))
-				cpy[i->to.val] = r;
-			else
-				cpy[i->to.val] = i->to;
-		}
-	}
+	int loop;
+	Ins *i, *ins;
+	uint npar, nins;
+	enum O extop;
+	Ref r;
 
-	/* 2. remove redundant phis/copies
-	 * and rewrite their uses */
-	for (b=fn->start; b; b=b->link) {
-		for (pp=&b->phi; (p=*pp);) {
-			r = cpy[p->to.val];
-			if (!req(r, p->to)) {
-				*pp = p->link;
-				continue;
-			}
-			for (a=0; a<p->narg; a++)
-				subst(&p->arg[a], cpy);
-			pp=&p->link;
-		}
-		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			r = cpy[i->to.val];
-			if (!req(r, i->to)) {
-				*i = (Ins){.op = Onop};
-				continue;
-			}
-			subst(&i->arg[0], cpy);
-			subst(&i->arg[1], cpy);
+	/* only useful for functions with loops */
+	loop = 0;
+	for (b = fn->start; b; b = b->link)
+		if (b->loop > 1) {
+			loop = 1;
+			break;
 		}
-		subst(&b->jmp.arg, cpy);
+	if (!loop)
+		return;
+
+	b = fn->start;
+	npar = 0;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+		if (!ispar(i->op))
+			break;
+		npar++;
 	}
 
-	if (debug['C']) {
-		fprintf(stderr, "\n> Copy information:");
-		for (t=Tmp0; t<fn->ntmp; t++) {
-			if (req(cpy[t], R)) {
-				fprintf(stderr, "\n%10s not seen!",
-					fn->tmp[t].name);
-			}
-			else if (!req(cpy[t], TMP(t))) {
-				fprintf(stderr, "\n%10s copy of ",
-					fn->tmp[t].name);
-				printref(cpy[t], fn, stderr);
+	if (npar == 0)
+		return;
+
+	nins = b->nins + npar;
+	ins = vnew(nins, sizeof ins[0], PFn); //alloc(nins * sizeof ins[0]);
+	memcpy(ins, b->ins, npar * sizeof ins[0]);
+	memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
+	b->ins = ins;
+	b->nins = nins;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+		if (!ispar(i->op))
+			break;
+		extop = Onop;
+		if (i->cls == Kw)
+		if (usewidthle(fn, i->to, 16)) {
+			if (usewidthle(fn, i->to, 8))
+				extop = Oextub;
+			else
+				extop = Oextuh;
 			}
+		if (extop == Onop) {
+			*(i+npar) = (Ins) {.op = Onop};
+		} else {
+			r = newtmp("vw", i->cls, fn);
+			*(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
+			i->to = r;
 		}
-		fprintf(stderr, "\n\n> After copy elimination:\n");
-		printfn(fn, stderr);
 	}
-	vfree(stk);
-	free(cpy);
+}
+
+/* used by GVN */
+Ref
+copyref(Fn *fn, Blk *b, Ins *i)
+{
+	static bits extcpy[] = {
+		[WFull] = 0,
+		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
+		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
+		[Wsh] = BIT(Wsh) | BIT(Wsw),
+		[Wuh] = BIT(Wuh) | BIT(Wuw),
+		[Wsw] = BIT(Wsw),
+		[Wuw] = BIT(Wuw),
+	};
+	bits bext;
+	Tmp *t;
+	int64_t v;
+	int is0;
+
+	if (i->op == Ocopy)
+		return i->arg[0];
+
+	/* op identity value */
+	if (optab[i->op].hasid)
+	if (KBASE(i->cls) == 0) /* integer only - fp NaN! */
+	if (req(i->arg[1], con01[optab[i->op].idval]))
+	if (!optab[i->op].cmpeqwl || iswu1(fn, i->arg[0]))
+		return i->arg[0];
+
+	/* idempotent op with identical args */
+	if (optab[i->op].idemp)
+	if (req(i->arg[0], i->arg[1]))
+		return i->arg[0];
+
+	/* integer cmp with identical args */
+	if (optab[i->op].cmpeqwl || optab[i->op].cmplgtewl)
+	if (req(i->arg[0], i->arg[1]))
+		return con01[optab[i->op].eqval];
+
+	/* cmpeq/ne 0 with 0/non-0 inference from dominating jnz */
+	if (optab[i->op].cmpeqwl)
+	if (req(i->arg[1], con01[0]))
+	if (is0non0(fn, b, i->arg[0], argcls(i,0), &is0))
+		return con01[optab[i->op].eqval^is0^1];
+
+	/* redundant and mask */
+	if (i->op == Oand)
+	if (isconbits(fn, i->arg[1], &v))
+	if (((v+1) & v) == 0) /* v == 2^N-1 */
+	if (defwidthle(fn, i->arg[0], u64_wbits(v)))
+		return i->arg[0];
+
+	if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
+		return R;
+	if (i->op == Oextsw || i->op == Oextuw)
+	if (i->cls == Kw)
+		return i->arg[0];
+
+	t = &fn->tmp[i->arg[0].val];
+	assert(KBASE(t->cls) == 0);
+	if (i->cls == Kl && t->cls == Kw)
+		return R;
+	bext = extcpy[t->width];
+	if ((BIT(Wsb + (i->op-Oextsb)) & bext) != 0)
+		return i->arg[0];
+
+	if (!isnarrowpar(fn, i->arg[0]))
+	if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
+		return i->arg[0];
+	if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
+		return i->arg[0];
+
+	return R;
 }
```
