Description
For some reason, the __addkf3
symbol linked by rustc says that 0x00000000000000000000000000000000 + 0x00000000000000000000000000000001 = 0x00000000000000000000000000000000
, when the answer should be 0x00000000000000000000000000000001
. Other symbols likely do the wrong thing here too.
Example:
$ powerpc64-linux-gnu-gcc add_test.c -o add_test.c.ppc64
$ rustc add_test.rs -o add_test.rs.ppc64 --target powerpc64-unknown-linux-gnu -Clinker=powerpc64-linux-gnu-gcc
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
0000000000000000000000000000000000 0.000000
0000000000000000000000000000000001 0.000000
0000000000000000000000000000000001 0.000000
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
[add_test.rs:12:5] a = 0x00000000000000000000000000000000
[add_test.rs:12:5] b = 0x00000000000000000000000000000001
[add_test.rs:14:5] c = 0x00000000000000000000000000000000
Our Rust code emits __addkf3
which calls the hardware f128 addition routine xsaddup
. The part I cannot explain is that the C version also emits __addkf3
which also calls xsaddup
, but somehow produces a correct result. I checked with GDB to make sure they are both hitting xsaddup
, so can't really explain the discrepancy.
Original notes at rust-lang/compiler-builtins#606 (comment), discussion on Zulip at https://rust-lang.zulipchat.com/#narrow/stream/122651-general/topic/f128.20system.20libraries.20noncompliant.20platforms/near/438486364.
Full code copied from rust-lang/compiler-builtins#606 (comment):
Rust code
#![feature(f128)]
#[no_mangle]
#[inline(never)]
fn add_entry(a: f128, b: f128) -> f128 {
a + b
}
fn main() {
let a = f128::from_bits(0x0);
let b = f128::from_bits(0x1);
dbg!(a, b);
let c = add_entry(a, b);
dbg!(c);
}
C code
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#define _Float128 __float128
typedef struct {
#if __BYTE_ORDER == __LITTLE_ENDIAN
uint64_t lower, upper;
#elif __BYTE_ORDER == __BIG_ENDIAN
uint64_t upper, lower;
#else
#error missing endian check
#endif
} __attribute__((aligned(_Alignof(_Float128)))) u128;
_Float128 __addkf3(_Float128, _Float128);
void f128_print(_Float128 val) {
u128 ival = *((u128 *)(&val));
printf("%#018" PRIx64 "%016" PRIx64 " %lf\n", ival.upper, ival.lower, (double)val);
}
_Float128 new_f128(uint64_t upper, uint64_t lower) {
u128 val = { .lower = lower, .upper = upper };
return *((_Float128 *)(&val));
}
_Float128 add_entry(_Float128 a, _Float128 b) {
#ifdef USE_ADDKF3
return __addkf3(a, b);
#else
return a + b;
#endif
}
int main() {
_Float128 a = new_f128(0x0000000000000000, 0x0000000000000000);
_Float128 b = new_f128(0x0000000000000000, 0x0000000000000001);
f128_print(a);
f128_print(b);
_Float128 c = add_entry(a, b);
f128_print(c);
return 0;
}
Assembly generated from Rust (incorrect result)
0000000000010d00 <.add_entry>:
10d00: 7c 08 02 a6 mflr r0
10d04: f8 21 ff 91 stdu r1,-112(r1)
10d08: f8 01 00 80 std r0,128(r1)
10d0c: 4b ff c6 95 bl d3a0 <00000143.plt_call.__addkf3>
10d10: e8 41 00 28 ld r2,40(r1)
10d14: 38 21 00 70 addi r1,r1,112
10d18: e8 01 00 10 ld r0,16(r1)
10d1c: 7c 08 03 a6 mtlr r0
10d20: 4e 80 00 20 blr
000000000000d3a0 <00000143.plt_call.__addkf3>:
d3a0: f8 41 00 28 std r2,40(r1)
d3a4: 3d 62 ff ff addis r11,r2,-1
d3a8: e9 8b 7f 58 ld r12,32600(r11)
d3ac: 7d 89 03 a6 mtctr r12
d3b0: e8 4b 7f 60 ld r2,32608(r11)
d3b4: 4e 80 04 20 bctr
0000000000056790 <.__addkf3_resolve>:
56790: 81 2d 8f 9c lwz r9,-28772(r13)
56794: 75 29 00 40 andis. r9,r9,64
56798: 41 82 00 18 beq 567b0 <.__addkf3_resolve+0x20>
5679c: e8 62 80 10 ld r3,-32752(r2)
567a0: 4e 80 00 20 blr
567a4: 60 00 00 00 nop
567a8: 60 00 00 00 nop
567ac: 60 42 00 00 ori r2,r2,0
567b0: e8 62 80 18 ld r3,-32744(r2)
567b4: 4e 80 00 20 blr
...
567c4: 60 00 00 00 nop
567c8: 60 00 00 00 nop
567cc: 60 42 00 00 ori r2,r2,0
000000000005e6e0 <.__addkf3_hw>:
5e6e0: fc 42 18 08 xsaddqp v2,v2,v3
5e6e4: 4e 80 00 20 blr
...
5e6f4: 60 00 00 00 nop
5e6f8: 60 00 00 00 nop
5e6fc: 60 00 00 00 nop
0000000000057050 <.__addkf3_sw>:
57050: fb 41 ff d0 std r26,-48(r1)
57054: fb 61 ff d8 std r27,-40(r1)
57058: fb 81 ff e0 std r28,-32(r1)
5705c: fb a1 ff e8 std r29,-24(r1)
57060: fb c1 ff f0 std r30,-16(r1)
57064: fb e1 ff f8 std r31,-8(r1)
57068: f8 21 ff 41 stdu r1,-192(r1)
5706c: 39 21 00 70 addi r9,r1,112
57070: 39 41 00 70 addi r10,r1,112
// ... full sw implementation
Assembly generated from C (correct result)
0000000010000af8 <.add_entry>:
10000af8: 7c 08 02 a6 mflr r0
10000afc: f8 01 00 10 std r0,16(r1)
10000b00: fb e1 ff f8 std r31,-8(r1)
10000b04: f8 21 ff 81 stdu r1,-128(r1)
10000b08: 7c 3f 0b 78 mr r31,r1
10000b0c: 39 20 00 30 li r9,48
10000b10: 39 5f 00 80 addi r10,r31,128
10000b14: 7c 4a 4f 99 stxvd2x vs34,r10,r9
10000b18: 39 20 00 40 li r9,64
10000b1c: 39 5f 00 80 addi r10,r31,128
10000b20: 7c 6a 4f 99 stxvd2x vs35,r10,r9
10000b24: 39 20 00 40 li r9,64
10000b28: 39 5f 00 80 addi r10,r31,128
10000b2c: 7c 6a 4e 99 lxvd2x vs35,r10,r9
10000b30: 39 20 00 30 li r9,48
10000b34: 39 5f 00 80 addi r10,r31,128
10000b38: 7c 4a 4e 99 lxvd2x vs34,r10,r9
10000b3c: 4b ff fb a5 bl 100006e0 <00000019.plt_call.__addkf3>
10000b40: e8 41 00 28 ld r2,40(r1)
10000b44: f0 02 14 96 xxmr vs0,vs34
10000b48: f0 40 04 91 xxmr vs34,vs0
10000b4c: 38 3f 00 80 addi r1,r31,128
10000b50: e8 01 00 10 ld r0,16(r1)
10000b54: 7c 08 03 a6 mtlr r0
10000b58: eb e1 ff f8 ld r31,-8(r1)
10000b5c: 4e 80 00 20 blr
10000b60: 00 00 00 00 .long 0x0
10000b64: 00 00 00 01 .long 0x1
10000b68: 80 01 00 01 lwz r0,1(r1)
0000000010000c40 <.__addkf3_resolve>:
10000c40: 81 2d 8f 9c lwz r9,-28772(r13)
10000c44: 75 29 00 40 andis. r9,r9,64
10000c48: 41 82 00 18 beq 10000c60 <.__addkf3_resolve+0x20>
10000c4c: e8 62 80 10 ld r3,-32752(r2)
10000c50: 4e 80 00 20 blr
10000c54: 60 00 00 00 nop
10000c58: 60 00 00 00 nop
10000c5c: 60 42 00 00 ori r2,r2,0
10000c60: e8 62 80 18 ld r3,-32744(r2)
10000c64: 4e 80 00 20 blr
...
10000c74: 60 00 00 00 nop
10000c78: 60 00 00 00 nop
10000c7c: 60 42 00 00 ori r2,r2,0
0000000010008b90 <.__addkf3_hw>:
10008b90: fc 42 18 08 xsaddqp v2,v2,v3
10008b94: 4e 80 00 20 blr
...
10008ba4: 60 00 00 00 nop
10008ba8: 60 00 00 00 nop
10008bac: 60 00 00 00 nop
0000000010001500 <.__addkf3_sw>:
10001500: fb 41 ff d0 std r26,-48(r1)
10001504: fb 61 ff d8 std r27,-40(r1)
10001508: fb 81 ff e0 std r28,-32(r1)
1000150c: fb a1 ff e8 std r29,-24(r1)
10001510: fb c1 ff f0 std r30,-16(r1)
10001514: fb e1 ff f8 std r31,-8(r1)
10001518: f8 21 ff 41 stdu r1,-192(r1)
1000151c: 39 21 00 70 addi r9,r1,112
10001520: 39 41 00 70 addi r10,r1,112
// ...
Activity
tgross35 commentedon May 14, 2024
cc @ecnelises
@rustbot label +F-f16_and_f128 +A-ABI +T-libs +C-bug -needs-triage
f128
rust-lang/compiler-builtins#606f16
andf128
float types #116909beetrees commentedon May 16, 2024
This is being caused by a calling convention mismatch; the Rust version passes/returns the
f128
s in GPRs, but the builtin expects them to be in vector registers. This is because the ABI off128
on PowerPC depends on whether thevsx
target feature has been enabled (the Rust target has it disabled by default): GCC gets around this by not supporting_Float128
when thevsx
target feature is disabled.tgross35 commentedon May 16, 2024
That is interesting. Any idea what is best to do here?
beetrees commentedon May 16, 2024
This is the equivalent of #116344 but for PowerPC; LLVM will generate builtins calls using whatever target features are enabled for the current function. For another example (compiler explorer)
When the above code is compiled for the
i586-unknown-linux-gnu
target, both functions will call the__trunctfhf2
builtin but thesoft_float
function will expect the result to be returned in theax
register and thehard_float
function will expect the result to be returned in thexmm0
register.The possible solutions I can think of are:
compiler-builtins
was used instead. We'd also have to prohibit linking with other Rust or C code that was compiled with the target feature in a different state, as that code would want to use the builtins with a different ABI. This prohibition would be extremely difficult to achieve if the system libc uses the relevant builtins (and was built with different relevant target features), although this is unlikely to be an issue forf16
andf128
.extern "C"
ABI of SIMD vector types depends on target features (tracking issue forabi_unsupported_vector_types
future-incompatibility lint) #116558. However, this would still leave the question of what to do when the relevant target feature is disabled but the system builtins have it enabled; enabling the target feature for the builtins calls would prevent the code running on a processor without that target feature, so in that case we'd still have to ship our own builtin fromcompiler-builtins
and prohibit linking like in option 1.Option 4 is probably the best solution here. Option 2/3 will probably be required for things like #116558, but that's a separate (but closely related) issue.
2 remaining items