rustc -C opt-level=3 generates bad assembly code for Vec by default
Compiling the following C++ snippet with clang++ -O3 and g++ -O3 (see here):
#include <vector>
unsigned foo() {
std::vector<unsigned> a;
a.push_back(314);
return a[0];
}
generates this assembly on x86_64:
foo(): # @foo()
push rax
mov edi, 4
call operator new(unsigned long)
mov rdi, rax
call operator delete(void*)
mov eax, 314
pop rcx
ret
(note: clang generates perfect assembly even with multiple push backs, the only thing that seems to trip it is a reallocation)
This snippet compiled with rustc --C opt-level=3 (see here):
pub fn foo() -> u32 {
let mut v: Vec<u32> = Vec::new();
v.push(0);
v[0]
}
generates the following assembly:
<alloc::raw_vec::RawVec<T, A>>::double:
push rbp
mov rbp, rsp
push r14
push rbx
sub rsp, 64
mov r14, rdi
mov rbx, qword ptr [r14 + 8]
test rbx, rbx
je .LBB0_6
lea rsi, [4*rbx]
lea rcx, [8*rbx]
mov rdi, qword ptr [r14]
lea r9, [rbp - 40]
mov edx, 4
mov r8d, 4
call __rust_realloc@PLT
test rax, rax
je .LBB0_4
add rbx, rbx
jmp .LBB0_3
.LBB0_6:
lea rdx, [rbp - 40]
mov edi, 16
mov esi, 4
call __rust_alloc@PLT
test rax, rax
je .LBB0_8
mov ebx, 4
.LBB0_3:
mov qword ptr [r14], rax
mov qword ptr [r14 + 8], rbx
add rsp, 64
pop rbx
pop r14
pop rbp
ret
.LBB0_4:
mov rax, qword ptr [rbp - 40]
movups xmm0, xmmword ptr [rbp - 32]
movaps xmmword ptr [rbp - 64], xmm0
mov qword ptr [rbp - 40], rax
movaps xmm0, xmmword ptr [rbp - 64]
jmp .LBB0_5
.LBB0_8:
movups xmm0, xmmword ptr [rbp - 32]
movaps xmmword ptr [rbp - 64], xmm0
movaps xmm0, xmmword ptr [rbp - 64]
movaps xmmword ptr [rbp - 80], xmm0
movaps xmm0, xmmword ptr [rbp - 80]
.LBB0_5:
movups xmmword ptr [rbp - 32], xmm0
lea rdi, [rbp - 40]
call <alloc::heap::Heap as alloc::allocator::Alloc>::oom
core::ptr::drop_in_place:
push rbp
mov rbp, rsp
mov rsi, qword ptr [rdi + 8]
test rsi, rsi
je .LBB1_1
mov rdi, qword ptr [rdi]
shl rsi, 2
mov edx, 4
pop rbp
jmp __rust_dealloc@PLT
.LBB1_1:
pop rbp
ret
<alloc::heap::Heap as alloc::allocator::Alloc>::oom:
push rbp
mov rbp, rsp
sub rsp, 32
mov rax, qword ptr [rdi + 16]
mov qword ptr [rbp - 16], rax
movups xmm0, xmmword ptr [rdi]
movaps xmmword ptr [rbp - 32], xmm0
lea rdi, [rbp - 32]
call __rust_oom@PLT
example::foo:
push rbp
mov rbp, rsp
push rbx
sub rsp, 24
mov qword ptr [rbp - 32], 4
xorps xmm0, xmm0
movups xmmword ptr [rbp - 24], xmm0
lea rdi, [rbp - 32]
call <alloc::raw_vec::RawVec<T, A>>::double
mov rdi, qword ptr [rbp - 32]
mov rax, qword ptr [rbp - 16]
mov dword ptr [rdi + 4*rax], 0
inc rax
mov qword ptr [rbp - 16], rax
je .LBB3_2
mov ebx, dword ptr [rdi]
mov rsi, qword ptr [rbp - 24]
test rsi, rsi
je .LBB3_6
shl rsi, 2
mov edx, 4
call __rust_dealloc@PLT
.LBB3_6:
mov eax, ebx
add rsp, 24
pop rbx
pop rbp
ret
.LBB3_2:
lea rdi, [rip + panic_bounds_check_loc.2]
xor esi, esi
xor edx, edx
call core::panicking::panic_bounds_check@PLT
mov rbx, rax
lea rdi, [rbp - 32]
call core::ptr::drop_in_place
mov rdi, rbx
call _Unwind_Resume@PLT
GCC_except_table3:
.byte 255
.byte 155
.asciz "\234"
.byte 3
.byte 26
.long .Ltmp29-.Lfunc_begin3
.long .Ltmp32-.Ltmp29
.long .Ltmp33-.Lfunc_begin3
.byte 0
.long .Ltmp32-.Lfunc_begin3
.long .Lfunc_end3-.Ltmp32
.long 0
.byte 0
str.1:
.ascii "/checkout/src/liballoc/vec.rs"
panic_bounds_check_loc.2:
.quad str.1
.quad 29
.long 1555
.long 10
DW.ref.rust_eh_personality:
.quad rust_eh_personality
I've tried adding -lto and -C panic=abort to rustc without much luck. I've also tried replacing [0] with unsafe { *v.get_unchecked(0) } without any luck. The only thing that makes it generate good assembly is using Vec::with_capacity(N) (see here):
pub fn foo() -> u32 {
let mut v: Vec<u32> = Vec::with_capacity(3);
v.push(7);
v.push(4);
v[1]
}
generates
example::foo:
push rbp
mov rbp, rsp
mov eax, 4
pop rbp
ret
This is the asm I am seeing:
foo: pushq %rsi subq $64, %rsp movq $4, 40(%rsp) vxorps %xmm0, %xmm0, %xmm0 vmovups %xmm0, 48(%rsp) leaq 40(%rsp), %rcx callq _ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h82e5d6919d342c2fE movq 40(%rsp), %rcx movq 56(%rsp), %rax movl $0, (%rcx,%rax,4) addq $1, %rax movq %rax, 56(%rsp) je .LBB3_2 movl (%rcx), %esi movq 48(%rsp), %rdx testq %rdx, %rdx je .LBB3_8 shlq $2, %rdx movl $4, %r8d callq __rust_dealloc .LBB3_8: movl %esi, %eax addq $64, %rsp popq %rsi retq .LBB3_2: leaq panic_bounds_check_loc.2(%rip), %rcx xorl %edx, %edx xorl %r8d, %r8d callq _ZN4core9panicking18panic_bounds_check17hbf9952bd7dce1212E ud2 .LBB3_4: movq %rax, %rcx callq rust_eh_unwind_resume ud2 .LBB3_9: movq %rax, %rsi leaq 40(%rsp), %rcx callq _ZN4core3ptr13drop_in_place17h731ac0064b2a28c1E movq %rsi, %rcx callq rust_eh_unwind_resume ud2leonardo-m at 2017-09-09 12:55:10
This is the asm I am seeing on the playground (link):
.text .intel_syntax noprefix .file "playground.cgu-0.rs" .section ".text.cold._ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE","ax",@progbits .p2align 4, 0x90 .type _ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE,@function _ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE: .cfi_startproc push r14 .Lcfi0: .cfi_def_cfa_offset 16 push rbx .Lcfi1: .cfi_def_cfa_offset 24 sub rsp, 72 .Lcfi2: .cfi_def_cfa_offset 96 .Lcfi3: .cfi_offset rbx, -24 .Lcfi4: .cfi_offset r14, -16 mov r14, rdi mov rbx, qword ptr [r14 + 8] test rbx, rbx je .LBB0_6 lea rsi, [4*rbx] lea rcx, [8*rbx] mov rdi, qword ptr [r14] lea r9, [rsp + 8] mov edx, 4 mov r8d, 4 call __rust_realloc@PLT test rax, rax je .LBB0_4 add rbx, rbx jmp .LBB0_3 .LBB0_6: lea rdx, [rsp + 8] mov edi, 16 mov esi, 4 call __rust_alloc@PLT test rax, rax je .LBB0_8 mov ebx, 4 .LBB0_3: mov qword ptr [r14], rax mov qword ptr [r14 + 8], rbx add rsp, 72 pop rbx pop r14 ret .LBB0_4: mov rax, qword ptr [rsp + 8] movups xmm0, xmmword ptr [rsp + 16] movaps xmmword ptr [rsp + 32], xmm0 mov qword ptr [rsp + 8], rax movaps xmm0, xmmword ptr [rsp + 32] jmp .LBB0_5 .LBB0_8: movups xmm0, xmmword ptr [rsp + 16] movaps xmmword ptr [rsp + 32], xmm0 movaps xmm0, xmmword ptr [rsp + 32] movaps xmmword ptr [rsp + 48], xmm0 movaps xmm0, xmmword ptr [rsp + 48] .LBB0_5: movups xmmword ptr [rsp + 16], xmm0 lea rdi, [rsp + 8] call _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E .Lfunc_end0: .size _ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE, .Lfunc_end0-_ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE .cfi_endproc .section .text._ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E,"ax",@progbits .p2align 4, 0x90 .type _ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E,@function _ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E: .cfi_startproc mov rsi, qword ptr [rdi + 8] test rsi, rsi je .LBB1_1 mov rdi, qword ptr [rdi] shl rsi, 2 mov edx, 4 jmp __rust_dealloc@PLT .LBB1_1: ret .Lfunc_end1: .size _ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E, .Lfunc_end1-_ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E .cfi_endproc .section ".text.cold._ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E","ax",@progbits .p2align 4, 0x90 .type _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E,@function _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E: .cfi_startproc sub rsp, 24 .Lcfi5: .cfi_def_cfa_offset 32 mov rax, qword ptr [rdi + 16] mov qword ptr [rsp + 16], rax movups xmm0, xmmword ptr [rdi] movaps xmmword ptr [rsp], xmm0 mov rdi, rsp call __rust_oom@PLT .Lfunc_end2: .size _ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E, .Lfunc_end2-_ZN61_$LT$alloc..heap..Heap$u20$as$u20$alloc..allocator..Alloc$GT$3oom17h8b5f820f2dd0e667E .cfi_endproc .section .text._ZN10playground4main17hf21e17cd5a16d5d9E,"ax",@progbits .p2align 4, 0x90 .type _ZN10playground4main17hf21e17cd5a16d5d9E,@function _ZN10playground4main17hf21e17cd5a16d5d9E: .Lfunc_begin0: .cfi_startproc .cfi_personality 155, DW.ref.rust_eh_personality .cfi_lsda 27, .Lexception0 push rbx .Lcfi6: .cfi_def_cfa_offset 16 sub rsp, 32 .Lcfi7: .cfi_def_cfa_offset 48 .Lcfi8: .cfi_offset rbx, -16 mov qword ptr [rsp + 8], 4 xorps xmm0, xmm0 movups xmmword ptr [rsp + 16], xmm0 .Ltmp0: lea rdi, [rsp + 8] call _ZN49_$LT$alloc..raw_vec..RawVec$LT$T$C$$u20$A$GT$$GT$6double17h1882b1047d8074ffE .Ltmp1: mov rdi, qword ptr [rsp + 8] mov rax, qword ptr [rsp + 24] mov dword ptr [rdi + 4*rax], 0 inc rax mov qword ptr [rsp + 24], rax je .LBB3_2 mov rsi, qword ptr [rsp + 16] test rsi, rsi je .LBB3_6 shl rsi, 2 mov edx, 4 call __rust_dealloc@PLT .LBB3_6: add rsp, 32 pop rbx ret .LBB3_2: .Ltmp2: lea rdi, [rip + panic_bounds_check_loc.2] xor esi, esi xor edx, edx call _ZN4core9panicking18panic_bounds_check17h78beadfd8229dc37E@PLT .Ltmp3: .LBB3_7: .Ltmp4: mov rbx, rax lea rdi, [rsp + 8] call _ZN4core3ptr13drop_in_place17h62d05f1818f0f9d8E mov rdi, rbx call _Unwind_Resume@PLT .Lfunc_end3: .size _ZN10playground4main17hf21e17cd5a16d5d9E, .Lfunc_end3-_ZN10playground4main17hf21e17cd5a16d5d9E .cfi_endproc .section .gcc_except_table,"a",@progbits .p2align 2 GCC_except_table3: .Lexception0: .byte 255 .byte 155 .asciz "\234" .byte 3 .byte 26 .long .Ltmp0-.Lfunc_begin0 .long .Ltmp3-.Ltmp0 .long .Ltmp4-.Lfunc_begin0 .byte 0 .long .Ltmp3-.Lfunc_begin0 .long .Lfunc_end3-.Ltmp3 .long 0 .byte 0 .p2align 2 .section .text.main,"ax",@progbits .globl main .p2align 4, 0x90 .type main,@function main: .cfi_startproc mov rax, rsi mov rcx, rdi lea rdi, [rip + _ZN10playground4main17hf21e17cd5a16d5d9E] mov rsi, rcx mov rdx, rax jmp _ZN3std2rt10lang_start17h573cecb903a42a26E@PLT .Lfunc_end4: .size main, .Lfunc_end4-main .cfi_endproc .type str.1,@object .section .rodata.str.1,"a",@progbits .p2align 4 str.1: .ascii "/checkout/src/liballoc/vec.rs" .size str.1, 29 .type panic_bounds_check_loc.2,@object .section .data.rel.ro.panic_bounds_check_loc.2,"aw",@progbits .p2align 3 panic_bounds_check_loc.2: .quad str.1 .quad 29 .long 1555 .long 10 .size panic_bounds_check_loc.2, 24 .hidden DW.ref.rust_eh_personality .weak DW.ref.rust_eh_personality .section .data.DW.ref.rust_eh_personality,"aGw",@progbits,DW.ref.rust_eh_personality,comdat .p2align 3 .type DW.ref.rust_eh_personality,@object .size DW.ref.rust_eh_personality, 8 DW.ref.rust_eh_personality: .quad rust_eh_personality .section ".note.GNU-stack","",@progbitsNot all of it is due to the vector code (it seems the play ground doesn't support creating an object file for code without main).
gnzlbg at 2017-09-09 13:01:01
One difference between using
with_capacityandVec::newis thatRawVec::doublewhich is called byVec::pushis annotated with#[cold]and#[inline(never)]which will prevent the compiler from optimizing it out (at least without lto). Normally preventing it from being inlined is probably ideal, but this is a bit of an edge case. Withwith_capacity, the code around the allocation is inlined, and the compiler is able to figure out that it's redundant and also avoids the call todoubleentirely since the cap was just set larger than the length.It looks like in the C++ version, manages optimize out the zeroing of length/capacity. The
doublecall probably prevents this in the rust version. There is also an bounds check, not sure why this isn't optimized out. Maybe the compiler can't prove that length field (which is part of Vec, not RawVec) doesn't change inRawVec::doubleMy (possibly incorrect) attempt at labelling what's happening:
... ; Function prologue (save stack pointer and reserve space) push rbp mov rbp, rsp push rbx sub rsp, 24 ; This sets the internal ptr to the (later) heap allocated data to 4 (The pointer is set to the alignment of the type initially if the vector is empty.) mov qword ptr [rbp - 32], 4 ; And this sets the both the capacity and length to 0 using vector instructions. xorps xmm0, xmm0 movups xmmword ptr [rbp - 24], xmm0 ; Load pointer to the RawVec to pass to the double function (I think) lea rdi, [rbp - 32] ; Call to capacity doubling function. call <alloc::raw_vec::RawVec<T, A>>::double ; Load ptr to heap memory into %rdi mov rdi, qword ptr [rbp - 32] ; Load length into %rax mov rax, qword ptr [rbp - 16] ; Set first element to 0 (I suppose it could be possible to avoid doing the 4*rax bit here since the length should always be 0 at this point.) mov dword ptr [rdi + 4*rax], 0 ; Increment length inc rax ; Store the length back to the length field in the vector instance mov qword ptr [rbp - 16], rax ; Jump to bounds check panic if incrementing length wraps to zero(I think) je .LBB3_2 ; Load value of the first element into %ebx mov ebx, dword ptr [rdi] ; Load the capacity into %rsi mov rsi, qword ptr [rbp - 24] ; Check if capacity == 0 test rsi, rsi ; Jump to end and skip deallocation if the capacity of the vector was 0. je .LBB3_6 ; Set up arguments to rust_dealloc (I think) shl rsi, 2 mov edx, 4 ; Deallocate vector memory call __rust_dealloc@PLT .LBB3_6: ; Load value from %eax into return register mov eax, ebx ; Function epilogue add rsp, 24 pop rbx pop rbp ret ...oyvindln at 2017-09-09 14:02:48
When using
::with_capacitythe Rust version:example::foo: push rbp mov rbp, rsp mov eax, 4 pop rbp retis way better than the C++ version
foo(): # @foo() push rax mov edi, 4 ;; ------------------------------------- ;; what's the point of this memory allocation? call operator new(unsigned long) mov rdi, rax call operator delete(void*) ;; ------------------------------------- mov eax, 314 pop rcx retIdeally the Rust versions with and without
::with_capacitywould beat the C++ version in both situations.gnzlbg at 2017-09-09 14:17:37
By altering Vec::push to:
let len = self.len; if len == self.buf.cap() { self.buf.double(); } unsafe { let end = self.as_mut_ptr().offset(len as isize); ptr::write(end, value); self.len = len + 1; }I got rid of the bounds check.
I then tried to remove #[inline(never)] from RawVec::double(), and lo and behold, the example with 0 resolved to this:
xorl %eax, %eax retqUsing other (non-0) values in push seem to work fine too.
I don't know whether that could have some negative in other cases though.
oyvindln at 2017-09-09 16:01:39
Is
RawVec::double()still[cold]in your case?gnzlbg at 2017-09-09 16:03:20
Yeah, I didn't remove that.
oyvindln at 2017-09-09 16:10:32
So... two questions:
-
why is
doublemarked#[inline(never)]? I understand the desire of making itcoldis to avoid the compiler from inlining it and trashing the instruction cache for the "rare" situation in which theVecgrows. However,#[inline(never)]is a hammer that prevents some optimizations for these other examples. Isn'tcoldenough? -
does
doubleactually double the size of aVecor does it use a memory-reuse-friendly growth factor like the one stated here and here or probably more relevant, at facebook (they are using jemalloc internally as well)? (in most vector implementations I knowdoubleis calledgrowinstead to denote that the growth factor is an implementation detail)
gnzlbg at 2017-09-09 16:16:35
-
Double is simply doubling. There is an issue about the growth strategy of vectors: #29931
oyvindln at 2017-09-09 16:40:43
why is double marked #[inline(never)]?
See #23670, reducing the size of
Vec::pushwas the motivation given.matthewjasper at 2017-09-09 17:38:49
Seems there is a bit of a tradeoff here.
oyvindln at 2017-09-09 17:51:35
Here's one silly benchmark:
I don't think the benchmark provided is silly. It shows that something wrong is going on with
#[cold]. Why is a cold function being inlined when it is not profitable to do so?It would be nice to know what happens in that benchmark w/o
#[inline(never)]in that case today, and check the assembly.
EDIT: maybe cold is not enough and we need to annotate the call to double as
unlikely? (using LLVM's branch prediction hints?)gnzlbg at 2017-09-09 17:52:54
Extend doesn't use push anymore, so that benchmark isn't all that useful for current rust.
EDIT: If I use push manually, the non-inlined version is significantly faster:
#[bench] fn x(b: &mut Bencher) { let mut v = Vec::with_capacity(100); b.iter(|| { for n in 0..100 { v.push(n); } v.truncate(0); } ); }with inline(never) test x ... bench: 202 ns/iter (+/- 5) without: test x ... bench: 334 ns/iter (+/- 29)oyvindln at 2017-09-09 18:04:59
Actually, completely optimizing away the allocation would technically not be correct, since that would strictly speaking be changing the behaviour of the function, as the allocation could theoretically fail and cause a panic/exception. So in other words, the C++ version having a memory allocation call is correct.
It might still be possible to improve the Rust implementation optimize down to something similar to the C++ one though.
oyvindln at 2017-09-09 20:37:07
The first change in my previous comment makes this benchmark significantly faster (I got about 120-130 ns, compared to around 200 for the current version.) Though, it increases the code size a small bit, so for functions where nothing is known about the length of the vector it might cause a slight performance increase.
I'm tinkering around to see if I can reproduce it with just using the assume intrinsic, I managed to get rid of the bounds check in the original example with that at least without any code size increase..
oyvindln at 2017-09-10 11:56:16
I'm tinkering around to see if I can reproduce it with just using the assume intrinsic,
There is an
llvm.expectintrinsic to tell LLVM that we expect the branch to not be taken in most cases.gnzlbg at 2017-09-10 12:10:47
I'm tinkering around to see if I can reproduce it with just using the assume intrinsic,
I managed to avoid the code size increase, EDIT: Apparently there was still one extra instruction. so at least I now have an improvement that avoids the bounds check and improves the benchmark (it seems llvm manages to merge the loop counter and length value after my changes.) I don't know if it's possible to reduce it further without letting double be inlined, but I'm not sure how to manage that without increasing the code size and slowing down things. (Using unlikely(), just prevented it from being inlined for the code in the OP, but not for a function that only did a push, so that wasn't very useful.)
~~I'm preparing a PR with my changes, so people can test if they are of any use.~~ EDIT: There seems to be a few tradeoffs between different ways of altering the push version. Not sure what cases it's best to optimize for. Putting push inside a tight loop on it's own may not be the most real-world relevant example since one would normally use
extendwhich avoids push entirely and is way faster as it can avoid checking the capacity inside the loop.oyvindln at 2017-09-10 18:30:20
@oyvindln how is getting rid of a potential panic "changing the behavior"? I mean, if the optimized vector doesn't fail with OOM when the unoptimized one would, is that really bad? Surely it doesn't affect algorithmic correctness… Memory allocation, OOM conditions, etc. are mostly implementation details (and consequently very chaotic on modern OSes) anyway.
If someone is trying to test OOM by allocating and destroying single-element vectors, s/he is better off calling
malloc()directly, no? And I fail to see any other case where removing an error condition via optimization could be called "changing observable behavior". Saying this would be akin to not allowing register allocation "because local variables are supposed to be on the stack". It does not quite seem reasonable to me, frankly.Árpád Goretity at 2017-09-17 13:09:23
@H2CO3 Ah sorry, I worded myself badly. I don't see a problem with optimizing out the allocations in practice (and normally that's what you would want), I was just pondering why the allocations weren't removed in the C++ version.
I tried various combinations of using the
assumeintrinsic calls on the length and on the capacity, ordering of the increment and the pointer write, and usingunlikelyon the inVec<T>::push()but it's a bit hard to say which combination would be the ideal one without a benchmark that is more relevant to real-world code. (Also using assume a lot is a bit iffy as it can cause problems if used incorrectly.)This was one compromise I came up with:
pub fn push(&mut self, value: T) { unsafe { // This will panic or abort if we would allocate > isize::MAX bytes // or if the length increment would overflow for zero-sized types. if unlikely(self.len == self.buf.cap()) { let len = self.len; self.buf.double(); // Let llvm know that the length didn't change // so if we push to a new vector of length 0, // we don't have to calculate the offset from // the start of the array. // This might be worth revisiting once #31681 is solved. // See also #44452. assume(self.len() == len); } let end = self.as_mut_ptr().offset(self.len as isize); // Copy len to a local before writing `value` to the vector. This seems to enable // llvm to merge the length and loop counter when writing to a fresh vector. let len = self.len; ptr::write(end, value); // Let llvm know that the length doesn't change in the write call. assume(self.len() == len); self.len += 1; // Let llvm know that len didn't overflow. // We can safely assume that there was no overflow, // since if len == usize::MAX, growing the vector would fail. // We already made the assumption that the invariant len <= capacity // holds when checked if we should grow the vector. // // Letting llvm know that there was no overflow can help llvm // avoid bounds checks after a push in some situations. assume(self.len() != 0); } }It does result in one extra instruction for a lone raw push call though. It's a bit harder to experiment with doing changes to
RawVec::doubleas it uses some rust internals so it's difficult to test as a local function.oyvindln at 2017-09-17 13:42:45
@oyvindln Oh, that does make sense. Sorry, I thought you were writing in favor of not optimizing away the allocation.
Árpád Goretity at 2017-09-17 14:39:44
@oyvindln
unlikely(self.len == self.buf.cap())this is what I had in mind, but I expected the code to look like this:
if unlikely(full()) { grow(); }and
growshould be marked#[cold], so that thecoldapplies to everything in the branch.gnzlbg at 2017-09-17 14:47:42
I know that
coldcalls are never inlined by llvm, but they just have a lower threshold in the MIR inliner. Is the MIR inliner the reason why it's conventional in the rust stdlib to mark functions as bothcoldandinline(never)whencoldalready prevents inlining in LLVM?Would dropping
coldfunctions for consideration in MIR inlining still prevent the double function from being inlined while still allowing LLVM to fully optimize the call? Even if it doesn't fix the issue we should probably preventcoldfunctions from being inlined for consistency with LLVM's behavior.Techcable at 2017-10-01 23:14:17
@Techcable MIR inlining is not currently enabled by default, and LLVM does inline
coldccfunctions (source). Also note that#[cold]translates to thecoldattribute, not thecoldcccalling convention, though that doesn't prevent inlining either.Hanna Kruppe at 2017-10-01 23:26:42
Thanks for the clarification @rkruppe, you'd think the LLVM docs would be right! Do you think using the
coldcccalling convention instead help prevent the issue? At the very least it'd help performance by helping reduce the call overhead.Techcable at 2017-10-02 21:15:56
On nightly the original code:
pub fn foo() -> u32 { let mut v: Vec<u32> = Vec::new(); v.push(0); v[0] }now generates this:
example::foo: xor eax, eax rethttps://godbolt.org/g/bo4WaM
Lena Schönburg at 2018-06-25 12:05:42
Nice! A second push still makes it trip: https://godbolt.org/g/TZ1Mbo =/
clang appears to handle a couple of
push_backs before tripping.gnzlbg at 2018-06-25 12:12:43
No idea if it'll help, but this sounds like the optimizer limitation that this C++ lightning talk is about: https://www.youtube.com/watch?v=s4wnuiCwTGU Dunno why Rust would fare worse at this.
Ixrec at 2018-07-02 00:02:50
The only thing that makes it generate good assembly is using
Vec::with_capacity(N)(see here):pub fn foo() -> u32 { let mut v: Vec<u32> = Vec::with_capacity(3); v.push(7); v.push(4); v[1] }generates
example::foo: push rbp mov rbp, rsp mov eax, 4 pop rbp retIt doesn't anymore, it has regressed from 1.42 to 1.43
Edit: Currently O2 results in the expected assembly, O3 in a large mess.
the8472 at 2020-06-06 05:10:16
It looks like going from 1.49.0 to 1.15.0 we now will generate the expected optimal code for any number of pushes when sufficient capacity is provided using
Vec::with_capacityand-Cpanic=abortis passed. But the allocations only fold away for a few pushes without-Cpanic=abort.Ben Kimock at 2021-02-09 04:39:16