Diffstat (limited to 'pkg/qbe/patch/0001-amd64-optimize-loading-0-into-registers.patch')
-rw-r--r--  pkg/qbe/patch/0001-amd64-optimize-loading-0-into-registers.patch  |  83
1 file changed, 0 insertions(+), 83 deletions(-)
diff --git a/pkg/qbe/patch/0001-amd64-optimize-loading-0-into-registers.patch b/pkg/qbe/patch/0001-amd64-optimize-loading-0-into-registers.patch
deleted file mode 100644
index e7d5d5b5..00000000
--- a/pkg/qbe/patch/0001-amd64-optimize-loading-0-into-registers.patch
+++ /dev/null
@@ -1,83 +0,0 @@
-From 55b93f727cbad62a13dce0136077b0ffb47b90d7 Mon Sep 17 00:00:00 2001
-From: Érico Nogueira <erico.erc@gmail.com>
-Date: Sun, 11 Jul 2021 19:19:12 -0300
-Subject: [PATCH] amd64: optimize loading 0 into registers
-
-Loading +0 into a floating point register can be done using pxor or
-xorps instructions. Per [1], we went with pxor because it can run on all
-vector ALU ports, even if it's one byte longer.
-
-Similarly, an integer register can be zeroed with xor, which has a
-smaller encoding than mov with 0 immediate.
-
-To implement this, we special case fixarg to allow Ocopy when the
-value is +0 for floating point, and change emitins to emit pxor/xor
-when it encounters a copy from 0.
-
-Co-authored-by: Michael Forney <mforney@mforney.org>
-
-[1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
----
- amd64/emit.c | 12 ++++++++++++
- amd64/isel.c | 12 +++++++-----
- 2 files changed, 19 insertions(+), 5 deletions(-)
-
-diff --git a/amd64/emit.c b/amd64/emit.c
-index 51d1a5c..a3e72e6 100644
---- a/amd64/emit.c
-+++ b/amd64/emit.c
-@@ -458,6 +458,18 @@ emitins(Ins i, Fn *fn, FILE *f)
- if (req(i.to, i.arg[0]))
- break;
- t0 = rtype(i.arg[0]);
-+ if (t0 == RCon
-+ && fn->con[i.arg[0].val].type == CBits
-+ && fn->con[i.arg[0].val].bits.i == 0) {
-+ if (isreg(i.to)) {
-+ if (KBASE(i.cls) == 0)
-+ emitf("xor%k %=, %=", &i, fn, f);
-+ else
-+ emitf("pxor %D=, %D=", &i, fn, f);
-+ break;
-+ }
-+ i.cls = KWIDE(i.cls) ? Kl : Kw;
-+ }
- if (i.cls == Kl
- && t0 == RCon
- && fn->con[i.arg[0].val].type == CBits) {
-diff --git a/amd64/isel.c b/amd64/isel.c
-index e29c8bf..4bec2e1 100644
---- a/amd64/isel.c
-+++ b/amd64/isel.c
-@@ -85,7 +85,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
- r1 = r0 = *r;
- s = rslot(r0, fn);
- op = i ? i->op : Ocopy;
-- if (KBASE(k) == 1 && rtype(r0) == RCon) {
-+ if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
- /* load floating points from memory
- * slots, they can't be used as
- * immediates
-@@ -99,13 +99,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
- a.offset.sym.id = intern(buf);
- fn->mem[fn->nmem-1] = a;
- }
-- else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
-+ else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
- /* load constants that do not fit in
- * a 32bit signed integer into a
-- * long temporary
-+ * long temporary OR
-+ * load positive zero into a floating
-+ * point register
- */
-- r1 = newtmp("isel", Kl, fn);
-- emit(Ocopy, Kl, r1, r0, R);
-+ r1 = newtmp("isel", k, fn);
-+ emit(Ocopy, k, r1, r0, R);
- }
- else if (s != -1) {
- /* load fast locals' addresses into
---
-2.42.0
-
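
For context, here is a minimal C sketch (not part of the repository or of the deleted patch) illustrating the code-generation choice described in the patch's commit message. The function names are made up for illustration; the instruction mnemonics and encoding sizes in the comments follow the commit message and standard x86-64 encodings.

	/* zero_double: without the optimization, a backend materializes +0.0
	 * by loading it from a memory slot (floating-point constants cannot
	 * be immediates on amd64); with it, the destination register is
	 * zeroed directly with "pxor %xmm0, %xmm0", needing no constant in
	 * the data section and no load. */
	double zero_double(void) { return 0.0; }

	/* zero_int: "xor %eax, %eax" encodes in 2 bytes, while
	 * "mov $0, %eax" needs 5, so zeroing an integer register with xor
	 * gives a smaller encoding than a mov with a 0 immediate. */
	int zero_int(void) { return 0; }

With most compilers, `cc -O2 -S file.c` on this snippet shows the xor (and pxor or xorps) zeroing idiom in the generated assembly.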