From c2ff93e75e5f6df8e1679120b18f0d5884deab2b Mon Sep 17 00:00:00 2001 From: Roland Paterson-Jones Date: Wed, 19 Jun 2024 16:48:11 +0200 Subject: Global Value Numbering / Global Code Motion More or less as proposed in its ninth iteration with the addition of a gcmmove() functionality to restore coherent local schedules. Changes since RFC 8: Features: - generalization of phi 1/0 detection - collapse linear jmp chains before GVN; simplifies if-graph detection used in 0/non-0 value inference and if-elim... - infer 0/non-0 values from dominating blk jnz; eliminates redundant cmp eq/ne 0 and associated jnz/blocks, for example redundant null pointer checks (hare codebase likes this) - remove (emergent) empty if-then-else graphlets between GVN and GCM; improves GCM instruction placement, particularly cmps. - merge %addr =l add %addr1, N sequences - reduces tmp count, register pressure. - squash consecutive associative ops with constant args, e.g. t1 = add t, N ... t2 = add t2, M -> t2 = add t, N+M Bug Fixes: - remove "cmp eq/ne of non-identical RCon's " in copyref(). RCon's are not guaranteed to be dedup'ed, and symbols can alias. Codebase: - moved some stuff into cfg.c including blkmerge() - some refactoring in gvn.c - simplification of reassoc.c - always reassoc all cmp ops and Kl add %t, N. Better on coremark, smaller codebase. - minor simplification of movins() - use vins Testing - standard QBE, cproc, hare, harec, coremark [still have Rust build issues with latest roland] Benchmark - coremark is ~15%+ faster than master - hare "HARETEST_INCLUDE='slow' make check" ~8% faster (crypto::sha1::sha1_1gb is biggest obvious win - ~25% faster) Changes since RFC 7: Bug fixes: - remove isbad4gcm() in GVN/GCM - it is unsound due to different state at GVN vs GCM time; replace with "reassociation" pass after GCM - fix intra-blk use-before-def after GCM - prevent GVN from deduping trapping instructions cos GCM will not move them - remove cmp eq/ne identical arg copy detection for floating point, it is not valid for NaN - fix cges/cged flagged as commutative in ops.h instead of cnes/cned respectively; just a typo Minor features: - copy detection handles cmp le/lt/ge/gt with identical args - treat (integer) div/rem by non-zero constant as non-trapping - eliminate add N/sub N pairs in copy detection - maintain accurate tmp use in GVN; not strictly necessary but enables interim global state sanity checking - "reassociation" of trivial constant offset load/store addresses, and cmp ops with point-of-use in pass after GCM - normalise commutative op arg order - e.g. op con, tmp -> op tmp, con to simplify copy detection and GVN instruction dedup Codebase: - split out core copy detection and constant folding (back) out into copy.c, fold.c respectively; gvn.c was getting monolithic - generic support for instruction moving in ins.c - used by GCM and reassoc - new reassociation pass in reassoc.c - other minor clean-up/refactor Changes since RFC 6: - More ext elimination in GVN by examination of def and use bit width - elimination of redundant and mask by bit width examination - Incorporation of Song's patch Changes since RFC 5: - avoidance of "bad" candidates for GVN/GCM - trivial address offset calculations, and comparisons - more copy detection mostly around boolean values - allow elimination of unused load, alloc, trapping instructions - detection of trivial boolean v ? 1 : 0 phi patterns - bug fix for (removal of) "chg" optimisation in ins recreation - it was missing removal of unused instructions in some cases ifelim() between GVN and GCM; deeper nopunused() --- all.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 7 deletions(-) (limited to 'all.h') diff --git a/all.h b/all.h index a5a0163..ceb7520 100644 --- a/all.h +++ b/all.h @@ -191,6 +191,7 @@ enum { #define INRANGE(x, l, u) ((unsigned)(x) - l <= u - l) /* linear in x */ #define isstore(o) INRANGE(o, Ostoreb, Ostored) #define isload(o) INRANGE(o, Oloadsb, Oload) +#define isalloc(o) INRANGE(o, Oalloc4, Oalloc16) #define isext(o) INRANGE(o, Oextsb, Oextuw) #define ispar(o) INRANGE(o, Opar, Opare) #define isarg(o) INRANGE(o, Oarg, Oargv) @@ -214,8 +215,15 @@ struct Op { char *name; short argcls[2][4]; uint canfold:1; - uint hasid:1; - uint idval:1; /* identity value 0/1 */ + uint hasid:1; /* op identity value? */ + uint idval:1; /* identity value 0/1 */ + uint commutes:1; /* commutative op? */ + uint assoc:1; /* associative op? */ + uint idemp:1; /* idempotent op? */ + uint cmpeqwl:1; /* Kl/Kw cmp eq/ne? */ + uint cmplgtewl:1; /* Kl/Kw cmp lt/gt/le/ge? */ + uint eqval:1; /* 1 for eq; 0 for ne */ + uint ispinned:1; /* GCM pinned op? */ }; struct Ins { @@ -230,7 +238,8 @@ struct Phi { Ref *arg; Blk **blk; uint narg; - int cls; + short cls; + uint visit:1; Phi *link; }; @@ -251,6 +260,7 @@ struct Blk { Blk *idom; Blk *dom, *dlink; + int domdpth; Blk **fron; uint nfron; @@ -344,6 +354,9 @@ struct Tmp { Wuw } width; int visit; + uint gcmbid; + uint gcminsn; // TODO get rid + uint gcmdefinsn; // TODO get rid }; struct Con { @@ -450,6 +463,20 @@ struct Dat { char isstr; }; +typedef struct InsLoc InsLoc; + +struct InsLoc { + uint bid; + uint insn; +}; + +typedef struct InsMov InsMov; + +struct InsMov { + InsLoc from; + InsLoc to; +}; + /* main.c */ extern Target T; extern char debug['Z'+1]; @@ -476,6 +503,7 @@ char *str(uint32_t); int argcls(Ins *, int); int isreg(Ref); int iscmp(int, int *, int *); +int invcmpwl(int); void emit(int, int, Ref, Ref, Ref); void emiti(Ins); void idup(Blk *, Ins *, ulong); @@ -484,16 +512,19 @@ int cmpop(int); int cmpneg(int); int clsmerge(short *, short); int phicls(int, Tmp *); +uint phiargn(Phi *, Blk *); +Ref phiarg(Phi *, Blk *); Ref newtmp(char *, int, Fn *); void chuse(Ref, int, Fn *); int symeq(Sym, Sym); Ref newcon(Con *, Fn *); Ref getcon(int64_t, Fn *); int addcon(Con *, Con *, int); +int isconbits(Fn *fn, Ref r, int64_t *v); +int istmpconbits(Fn *fn, Ins *i, int64_t *v); void salloc(Ref, Ref, Fn *); void dumpts(BSet *, Tmp *, FILE *); void runmatch(uchar *, Num *, Ref, Ref *); - void bsinit(BSet *, uint); void bszero(BSet *); uint bscount(BSet *); @@ -525,7 +556,6 @@ void elimsb(Fn *); /* cfg.c */ Blk *newblk(void); -void edgedel(Blk *, Blk **); void fillpreds(Fn *); void fillcfg(Fn *); void filldom(Fn *); @@ -533,8 +563,20 @@ int sdom(Blk *, Blk *); int dom(Blk *, Blk *); void fillfron(Fn *); void loopiter(Fn *, void (*)(Blk *, Blk *)); +void filldomdpth(Fn *); +Blk *lca(Blk *, Blk *); void fillloop(Fn *); void simpljmp(Fn *); +void replacepreds(Blk *, Blk *, Blk *); +void killblks(Fn *); +void blkmerge(Fn *); +int ifgraph(Blk *, Blk **, Blk **, Blk **); +int ifjoin(Blk *, Blk **, Blk **, Blk **); +int emptyblk(Blk *); +void ifelim(Fn *); +void clrbvisit(Fn *); +int reaches(Fn *,Blk *, Blk *); +int reachesnotvia(Fn *, Blk *, Blk *, Blk *); /* mem.c */ void promote(Fn *); @@ -552,15 +594,36 @@ int storesz(Ins *); void loadopt(Fn *); /* ssa.c */ +void adduse(Tmp *, int, Blk *, ...); void filluse(Fn *); void ssa(Fn *); void ssacheck(Fn *); /* copy.c */ -void copy(Fn *); +int iswu1(Fn *, Ref); +void narrowpars(Fn *); +Ref copyref(Fn *, Blk *, Ins *); /* fold.c */ -void fold(Fn *); +Ref foldref(Fn *, Ins *); + +/* gvn.c */ +extern Ref con01[2]; +int is0non0(Fn *, Blk *, Ref, int, int *); +void gvn(Fn *); + +/* gcm.c */ +int isfixed(Fn *, Ins *); +void gcm(Fn *); + +/* ins.c */ +void addins(Ins **, uint *, Ins *); +void addbins(Blk *, Ins **, uint *); +void nopunused(Fn *); +void movins(Fn *, InsMov *, uint, int); + +/* reassoc.c */ +void reassoc(Fn *); /* simpl.c */ void simpl(Fn *); -- cgit v1.2.3