1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
From 55b93f727cbad62a13dce0136077b0ffb47b90d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89rico=20Nogueira?= <erico.erc@gmail.com>
Date: Sun, 11 Jul 2021 19:19:12 -0300
Subject: [PATCH] amd64: optimize loading 0 into registers
Loading +0 into a floating point register can be done using pxor or
xorps instructions. Per [1], we went with pxor because it can run on all
vector ALU ports, even if it's one byte longer.
Similarly, an integer register can be zeroed with xor, which has a
smaller encoding than mov with 0 immediate.
To implement this, we special case fixarg to allow Ocopy when the
value is +0 for floating point, and change emitins to emit pxor/xor
when it encounters a copy from 0.
Co-authored-by: Michael Forney <mforney@mforney.org>
[1] https://stackoverflow.com/questions/39811577/does-using-mix-of-pxor-and-xorps-affect-performance/39828976
---
amd64/emit.c | 12 ++++++++++++
amd64/isel.c | 12 +++++++-----
2 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/amd64/emit.c b/amd64/emit.c
index 51d1a5c..a3e72e6 100644
--- a/amd64/emit.c
+++ b/amd64/emit.c
@@ -458,6 +458,18 @@ emitins(Ins i, Fn *fn, FILE *f)
if (req(i.to, i.arg[0]))
break;
t0 = rtype(i.arg[0]);
+ if (t0 == RCon
+ && fn->con[i.arg[0].val].type == CBits
+ && fn->con[i.arg[0].val].bits.i == 0) {
+ if (isreg(i.to)) {
+ if (KBASE(i.cls) == 0)
+ emitf("xor%k %=, %=", &i, fn, f);
+ else
+ emitf("pxor %D=, %D=", &i, fn, f);
+ break;
+ }
+ i.cls = KWIDE(i.cls) ? Kl : Kw;
+ }
if (i.cls == Kl
&& t0 == RCon
&& fn->con[i.arg[0].val].type == CBits) {
diff --git a/amd64/isel.c b/amd64/isel.c
index e29c8bf..4bec2e1 100644
--- a/amd64/isel.c
+++ b/amd64/isel.c
@@ -85,7 +85,7 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
r1 = r0 = *r;
s = rslot(r0, fn);
op = i ? i->op : Ocopy;
- if (KBASE(k) == 1 && rtype(r0) == RCon) {
+ if (KBASE(k) == 1 && rtype(r0) == RCon && fn->con[r0.val].bits.i != 0) {
/* load floating points from memory
* slots, they can't be used as
* immediates
@@ -99,13 +99,15 @@ fixarg(Ref *r, int k, Ins *i, Fn *fn)
a.offset.sym.id = intern(buf);
fn->mem[fn->nmem-1] = a;
}
- else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
+ else if (op != Ocopy && ((k == Kl && noimm(r0, fn)) || (KBASE(k) == 1 && rtype(r0) == RCon))) {
/* load constants that do not fit in
* a 32bit signed integer into a
- * long temporary
+ * long temporary OR
+ * load positive zero into a floating
+ * point register
*/
- r1 = newtmp("isel", Kl, fn);
- emit(Ocopy, Kl, r1, r0, R);
+ r1 = newtmp("isel", k, fn);
+ emit(Ocopy, k, r1, r0, R);
}
else if (s != -1) {
/* load fast locals' addresses into
--
2.42.0
|