summaryrefslogtreecommitdiff
path: root/pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch')
-rw-r--r--pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch83
1 files changed, 83 insertions, 0 deletions
diff --git a/pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch b/pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch
new file mode 100644
index 00000000..07aa7e6b
--- /dev/null
+++ b/pkg/qbe/patch/0005-amd64-optimize-loading-0-into-registers.patch
@@ -0,0 +1,83 @@
+From a11da13e22a694f8fe4a81d894d433f50ce4af6b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com>
+Date: Sun, 11 Jul 2021 19:19:12 -0300
+Subject: [PATCH] amd64: optimize loading 0 into registers
+
+Loading +0 into a floating point register can be done using pxor or
+xorps instructions. Per [1], we went with pxor because it can run on all
+vector ALU ports, even if it's one byte longer.
+
+Similarly, an integer register can be zeroed with xor, which has a
+smaller encoding than mov with 0 immediate.
+
+To implement this, we special case fixarg to allow Ocopy when the
+value is +0 for floating point, and change emitins to emit pxor/xor
+when it encounters a copy from 0.
+
+Co-authored-by: Michael Forney <mforney@mforney.org>
+
+[1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
+---
+ amd64/emit.c | 12 ++++++++++++
+ amd64/isel.c | 12 +++++++-----
+ 2 files changed, 19 insertions(+), 5 deletions(-)
+
+diff --git a/amd64/emit.c b/amd64/emit.c
+index a888000..7aeeff5 100644
+--- a/amd64/emit.c
++++ b/amd64/emit.c
+@@ -443,6 +443,18 @@ emitins(Ins i, Fn *fn, FILE *f)
+ if (req(i.to, i.arg[0]))
+ break;
+ t0 = rtype(i.arg[0]);
++ if (t0 == RCon
++ && fn->con[i.arg[0].val].type == CBits
++ && fn->con[i.arg[0].val].bits.i == 0) {
++ if (isreg(i.to)) {
++ if (KBASE(i.cls) == 0)
++ emitf("xor%k %=, %=", &i, fn, f);
++ else
++ emitf("pxor %D=, %D=", &i, fn, f);
++ break;
++ }
++ i.cls = KWIDE(i.cls) ? Kl : Kw;
++ }
+ if (i.cls == Kl
+ && t0 == RCon
+ && fn->con[i.arg[0].val].type == CBits) {
+diff --git a/amd64/isel.c b/amd64/isel.c
+index 607c176..1c902f5 100644
+--- a/amd64/isel.c
++++ b/amd64/isel.c
+@@ -69,7 +69,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
+ r1 = r0 = *r;
+ s = rslot(r0, fn);
+ op = i ? i->op : Ocopy;
+- if (KBASE(k) == 1 && rtype(r0) == RCon) {
++ if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
+ /* load floating points from memory
+ * slots, they can't be used as
+ * immediates
+@@ -84,13 +84,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
+ a.offset.label = intern(buf);
+ fn->mem[fn->nmem-1] = a;
+ }
+- else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
++ else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
+ /* load constants that do not fit in
+ * a 32bit signed integer into a
+- * long temporary
++ * long temporary OR
++ * load positive zero into a floating
++ * point register
+ */
+- r1 = newtmp("isel", Kl, fn);
+- emit(Ocopy, Kl, r1, r0, R);
++ r1 = newtmp("isel", k, fn);
++ emit(Ocopy, k, r1, r0, R);
+ }
+ else if (s != -1) {
+ /* load fast locals' addresses into
+--
+2.32.0
+