Diffstat (limited to 'pkg/qbe/patch/0002-amd64-optimize-loading-0-into-registers.patch')
-rw-r--r--   pkg/qbe/patch/0002-amd64-optimize-loading-0-into-registers.patch   83
1 file changed, 83 insertions, 0 deletions
diff --git a/pkg/qbe/patch/0002-amd64-optimize-loading-0-into-registers.patch b/pkg/qbe/patch/0002-amd64-optimize-loading-0-into-registers.patch
new file mode 100644
index 00000000..da5bcc1f
--- /dev/null
+++ b/pkg/qbe/patch/0002-amd64-optimize-loading-0-into-registers.patch
@@ -0,0 +1,83 @@
+From 97d75808dc4e8eff6d15a56e6812af168dc265d7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com>
+Date: Sun, 11 Jul 2021 19:19:12 -0300
+Subject: [PATCH] amd64: optimize loading 0 into registers
+
+Loading +0 into a floating point register can be done using pxor or
+xorps instructions. Per [1], we went with pxor because it can run on all
+vector ALU ports, even if it's one byte longer.
+
+Similarly, an integer register can be zeroed with xor, which has a
+smaller encoding than mov with 0 immediate.
+
+To implement this, we special case fixarg to allow Ocopy when the
+value is +0 for floating point, and change emitins to emit pxor/xor
+when it encounters a copy from 0.
+
+Co-authored-by: Michael Forney <mforney@mforney.org>
+
+[1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
+---
+ amd64/emit.c | 12 ++++++++++++
+ amd64/isel.c | 12 +++++++-----
+ 2 files changed, 19 insertions(+), 5 deletions(-)
+
+diff --git a/amd64/emit.c b/amd64/emit.c
+index b8e9e8e..388b8b3 100644
+--- a/amd64/emit.c
++++ b/amd64/emit.c
+@@ -450,6 +450,18 @@ emitins(Ins i, Fn *fn, FILE *f)
+ 		if (req(i.to, i.arg[0]))
+ 			break;
+ 		t0 = rtype(i.arg[0]);
++		if (t0 == RCon
++		&& fn->con[i.arg[0].val].type == CBits
++		&& fn->con[i.arg[0].val].bits.i == 0) {
++			if (isreg(i.to)) {
++				if (KBASE(i.cls) == 0)
++					emitf("xor%k %=, %=", &i, fn, f);
++				else
++					emitf("pxor %D=, %D=", &i, fn, f);
++				break;
++			}
++			i.cls = KWIDE(i.cls) ? Kl : Kw;
++		}
+ 		if (i.cls == Kl
+ 		&& t0 == RCon
+ 		&& fn->con[i.arg[0].val].type == CBits) {
+diff --git a/amd64/isel.c b/amd64/isel.c
+index 4181e26..d4f0b69 100644
+--- a/amd64/isel.c
++++ b/amd64/isel.c
+@@ -69,7 +69,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
+ 	r1 = r0 = *r;
+ 	s = rslot(r0, fn);
+ 	op = i ? i->op : Ocopy;
+-	if (KBASE(k) == 1 && rtype(r0) == RCon) {
++	if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
+ 		/* load floating points from memory
+ 		 * slots, they can't be used as
+ 		 * immediates
+@@ -84,13 +84,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
+ 		a.offset.label = intern(buf);
+ 		fn->mem[fn->nmem-1] = a;
+ 	}
+-	else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
++	else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
+ 		/* load constants that do not fit in
+ 		 * a 32bit signed integer into a
+-		 * long temporary
++		 * long temporary OR
++		 * load positive zero into a floating
++		 * point register
+ 		 */
+-		r1 = newtmp("isel", Kl, fn);
+-		emit(Ocopy, Kl, r1, r0, R);
++		r1 = newtmp("isel", k, fn);
++		emit(Ocopy, k, r1, r0, R);
+ 	}
+ 	else if (s != -1) {
+ 		/* load fast locals' addresses into
+-- 
+2.34.1
+
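
Editorial note (not part of the patch): the size argument in the commit message can be checked against the standard x86-64 encodings. Below is a minimal standalone C sketch, with a hypothetical zero_insn helper, that mirrors the instruction choice the patched emitins makes; the byte counts in the comments are the well-known encodings and back the "one byte longer" claim for pxor versus xorps.

#include <stdio.h>

/* Hypothetical stand-ins for qbe's KBASE(cls) result:
 * 0 for the integer classes (Kw/Kl), 1 for floating point (Ks/Kd). */
enum { K_INT = 0, K_FLT = 1 };

/* Mirrors the choice the patched emitins makes for a copy of
 * constant 0 into a register. Encoding sizes on x86-64:
 *   xor   %eax, %eax    31 c0            (2 bytes)
 *   mov   $0, %eax      b8 00 00 00 00   (5 bytes)
 *   xorps %xmm0, %xmm0  0f 57 c0         (3 bytes)
 *   pxor  %xmm0, %xmm0  66 0f ef c0      (4 bytes)
 * pxor is one byte longer than xorps but can issue on any
 * vector ALU port, which is why the patch prefers it. */
static const char *zero_insn(int kbase)
{
	return kbase == K_INT ? "xor" : "pxor";
}

int main(void)
{
	printf("integer: %s %%eax, %%eax\n", zero_insn(K_INT));
	printf("float:   %s %%xmm0, %%xmm0\n", zero_insn(K_FLT));
	return 0;
}

The program only prints which instruction would be chosen for each register class; it compiles with any C compiler and has no dependency on qbe.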

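A second editorial note: the fixarg change relies on the fact that only +0.0 has an all-zero IEEE 754 bit pattern, while -0.0 sets the sign bit. That is why the added bits.i != 0 test correctly sends +0.0 to the Ocopy path (where emitins turns it into pxor) and keeps -0.0 and every other float constant on the load-from-memory path. A minimal standalone check, assuming a hypothetical is_pos_zero helper that is not qbe code:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Returns 1 iff d is positive zero, i.e. its 64-bit pattern is all
 * zeros. This is the property the bits.i != 0 test in fixarg relies
 * on: -0.0 sets the sign bit, so it is not a candidate for pxor and
 * still goes through the memory-slot path. */
static int is_pos_zero(double d)
{
	uint64_t bits;
	memcpy(&bits, &d, sizeof bits);
	return bits == 0;
}

int main(void)
{
	printf("+0.0 -> %d\n", is_pos_zero(0.0));   /* 1: can use pxor */
	printf("-0.0 -> %d\n", is_pos_zero(-0.0));  /* 0: load from memory */
	printf(" 1.0 -> %d\n", is_pos_zero(1.0));   /* 0: load from memory */
	return 0;
}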