Open
Description
Here I report a moderate performance difference, if you think it isn't important or there's nothing worth doing here, then close this issue.
This is a small Rust function foo:
fn foo() -> i64 {
const LIM: i64 = 30;
let mut result: i64 = 3;
let mut i = 4;
while i <= LIM {
let n: i64 = (1 << i) - i;
let n21: i64 = (n - 1) << 1;
let mut x: i64 = 0;
let mut y: i64 = n - 1;
let mut r: i64 = 0;
let mut count: i64 = 1;
let mut odd: i64 = 0;
let mut was: i64 = 0;
while x < y {
if (x << 1) < n21 - r {
r += x;
x += 1;
r += x;
count += 1;
} else {
r -= y;
y -= 1;
r -= y;
let new_was: i64;
if r < 0 {
r += x;
x += 1;
r += x;
new_was = 0;
} else {
new_was = 1;
};
was += new_was;
if (count & 1) != 0 {
result += (count - 2 * was) << 1;
} else {
result -= (count - 2 * was) << 1;
}
odd += count + (count & 1) - was;
was = new_was;
count = 1;
}
}
if x == y {
odd += 1 - was;
if count > 1 {
result += 4 * was - 1;
} else {
result += 1;
}
}
result += 2 * odd * (n - odd);
i += 2;
}
result
}
fn main() {
assert_eq!(foo(), 467178235146843549);
}
For testing I've translated it to (hopefully) equivalent C code (but at the end it puts() a string instead of asserting):
#include <stdio.h>
#include <stdint.h>
int64_t foo() {
const int64_t LIM = 30;
int64_t result = 3;
int64_t i = 4;
while (i <= LIM) {
const int64_t n = (1 << i) - i;
const int64_t n21 = (n - 1) << 1;
int64_t x = 0;
int64_t y = n - 1;
int64_t r = 0;
int64_t count = 1;
int64_t odd = 0;
int64_t was = 0;
while (x < y) {
if ((x << 1) < n21 - r) {
r += x;
x += 1;
r += x;
count += 1;
} else {
r -= y;
y -= 1;
r -= y;
int64_t new_was;
if (r < 0) {
r += x;
x += 1;
r += x;
new_was = 0;
} else {
new_was = 1;
}
was += new_was;
if ((count & 1) != 0) {
result += (count - 2 * was) << 1;
} else {
result -= (count - 2 * was) << 1;
}
odd += count + (count & 1) - was;
was = new_was;
count = 1;
}
}
if (x == y) {
odd += 1 - was;
if (count > 1) {
result += 4 * was - 1;
} else {
result += 1;
}
}
result += 2 * odd * (n - odd);
i += 2;
}
return result;
}
int main() {
printf("%s\n", foo() == 467178235146843549LL ? "true" : "false");
return 0;
}
I've compiled both versions with simple compiler switches, for the Rust code:
rustc --edition 2024 -C opt-level=3 -C strip=symbols test1.rs
Using rustc 1.89.0-nightly (4d08223 2025-05-31), the run-time I'm seeing is about 1.84 seconds.
For the C code:
gcc -O3 test2.c -o test2
Using g++ 16.0.0 20250601 (latest trunk), the run-time I'm seeing is about 1.60 seconds.
The asm generated by GCC:
foo:
pushq %r15
movl $4, %ecx
movl $3, %r10d
movl $11, %r9d
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
movl $12, %ebp
pushq %rbx
.L13:
leaq (%r9,%r9), %r13
xorl %r12d, %r12d
xorl %ebx, %ebx
movl $1, %edi
xorl %eax, %eax
xorl %edx, %edx
jmp .L8
.L18:
addq %rdx, %rax
addq $1, %rdx
movq %rsi, %rdi
addq %rdx, %rax
cmpq %rdx, %r9
jle .L17
.L8:
movq %r13, %r11
leaq (%rdx,%rdx), %r8
leaq 1(%rdi), %rsi
subq %rax, %r11
cmpq %r8, %r11
jg .L18
subq %r9, %rax
subq $1, %r9
subq %r9, %rax
js .L19
leaq 1(%r12), %r11
movl $1, %r12d
.L7:
leaq (%r11,%r11), %r15
movq %rdi, %r8
subq %r15, %r8
addq %r8, %r8
leaq (%r10,%r8), %r15
subq %r8, %r10
andl $1, %edi
movl $1, %edi
cmovne %r15, %r10
andq $-2, %rsi
subq %r11, %rsi
addq %rsi, %rbx
cmpq %rdx, %r9
jg .L8
.L17:
je .L9
subq %rbx, %rbp
imulq %rbx, %rbp
leaq (%r10,%rbp,2), %r10
.L10:
addq $2, %rcx
cmpq $32, %rcx
je .L1
.L20:
movl $1, %ebp
sall %cl, %ebp
movslq %ebp, %rbp
subq %rcx, %rbp
leaq -1(%rbp), %r9
jmp .L13
.L19:
addq %rdx, %rax
addq $1, %rdx
movq %r12, %r11
xorl %r12d, %r12d
addq %rdx, %rax
jmp .L7
.L9:
movq %r12, %rax
xorq $1, %rax
addq %rbx, %rax
subq %rax, %rbp
imulq %rax, %rbp
cmpq $1, %rdi
je .L11
leaq -1(%r10,%r12,4), %rax
addq $2, %rcx
leaq (%rax,%rbp,2), %r10
cmpq $32, %rcx
jne .L20
.L1:
popq %rbx
movq %r10, %rax
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
.L11:
leaq 1(%r10,%rbp,2), %r10
jmp .L10
And the asm generated by rustc:
foo::h1870fe44660fcb04:
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
movl $4, %ecx
movl $3, %eax
jmp .LBB0_1
.LBB0_9:
testq %rdi, %rdi
je .LBB0_10
xorl %esi, %esi
.LBB0_13:
subq %rsi, %rdx
imulq %rsi, %rdx
leaq (%rax,%rdx,2), %rax
leaq 2(%rcx), %rdx
cmpq $29, %rcx
movq %rdx, %rcx
jae .LBB0_14
.LBB0_1:
movl $1, %edx
shlq %cl, %rdx
subq %rcx, %rdx
leaq -1(%rdx), %rdi
cmpq $2, %rdx
jl .LBB0_9
leaq -2(,%rdx,2), %r9
movl $1, %r8d
xorl %ebx, %ebx
xorl %r11d, %r11d
xorl %esi, %esi
xorl %r10d, %r10d
jmp .LBB0_3
.LBB0_15:
addq %rbx, %r11
addq %rbx, %r11
incq %r11
incq %rbx
incq %r8
movq %rbx, %r14
cmpq %rdi, %r14
jge .LBB0_6
.LBB0_3:
leaq (%rbx,%rbx), %r14
movq %r9, %r15
subq %r11, %r15
cmpq %r15, %r14
jl .LBB0_15
subq %rdi, %r11
subq %rdi, %r11
leaq 1(%r11), %r12
leaq 1(%rbx), %r14
xorl %r15d, %r15d
testq %r12, %r12
leaq 1(%r11,%rbx), %r11
leaq 1(%rbx,%r11), %r11
cmovnsq %r12, %r11
cmovnsq %rbx, %r14
setns %r15b
addq %r15, %r10
movq %r8, %rbx
andq $1, %rbx
jne .LBB0_16
subq %r8, %rax
subq %r8, %rax
leaq (%rax,%r10,4), %rax
jmp .LBB0_17
.LBB0_16:
leaq (%rax,%r8,2), %rax
leaq (,%r10,4), %r12
subq %r12, %rax
.LBB0_17:
decq %rdi
addq %rsi, %r8
addq %rbx, %r8
subq %r10, %r8
movq %r15, %r10
movq %r8, %rsi
movl $1, %r8d
movq %r14, %rbx
cmpq %rdi, %r14
jl .LBB0_3
.LBB0_6:
jne .LBB0_13
subq %r10, %rsi
incq %rsi
cmpq $1, %r8
jle .LBB0_11
leaq (%rax,%r10,4), %rax
decq %rax
jmp .LBB0_13
.LBB0_10:
movl $1, %esi
.LBB0_11:
incq %rax
jmp .LBB0_13
.LBB0_14:
popq %rbx
popq %r12
popq %r14
popq %r15
retq
My asm skills aren't refined enough to spot where/how the gcc asm is more efficient than the rustc asm. Can you spot it?