Output code bloat when programming with a functional style
I have been wanting to experiment writing some code in Rust with a more “pure functional / Haskelly” style. To explain things more clearly, let’s say you have this type.
#[derive(Debug)]
struct Type {
field1 : u64,
field2 : u32,
field3 : Vec<u32>,
}
A very simple method to modify field2 would be written so in everyday Rust.
impl Type {
#[no_mangle]
#[inline(never)]
fn mut_update(&mut self, field2 : u32) -> &mut Self {
self.field2 = field2;
self
}
}
The &mut Self as return value is here so that one can chain such method calls. Now here is a more “pure functional” way of doing it.
impl Type {
#[no_mangle]
#[inline(never)]
fn func_update(self, field2 : u32) -> Self {
Type { field2 : field2, .. self }
}
}
And once optimized out, these two functions should be perfectly identical, because func_update takes the property of self and never returns it, so the memory can be safely reused.
So I tested it out with this basic code.
fn main() {
let mut var = Type { field1 : 42, field2 : 0, field3 : vec![12] };
let _ = var.mut_update(79);
let _other = var.func_update(112);
}
But no… This is the assembly generated for mut_update
mut_update:
.cfi_startproc
mov dword ptr [rdi + 32], 79
ret
And this is the one generated for func_update.
func_update:
.cfi_startproc
mov rax, qword ptr [rsi]
mov qword ptr [rdi], rax
mov dword ptr [rdi + 32], 112
mov rax, qword ptr [rsi + 24]
mov qword ptr [rdi + 24], rax
movups xmm0, xmmword ptr [rsi + 8]
movups xmmword ptr [rdi + 8], xmm0
mov rax, rdi
ret
As you can see, the whole memory occupied by self is copied to another location, a pointer to which is returned by the function. And here is the LLVM IR for these two functions.
; Function Attrs: noinline norecurse nounwind uwtable
define internal fastcc void @mut_update(%Type* nocapture dereferenceable(40)) unnamed_addr #0 {
start:
%1 = getelementptr inbounds %Type, %Type* %0, i64 0, i32 4
store i32 79, i32* %1, align 4
ret void
}
; Function Attrs: noinline nounwind uwtable
define internal fastcc void @func_update(%Type* noalias nocapture sret dereferenceable(40), %Type* noalias nocapture readonly dereferenceable(40)) unnamed_addr #1 {
start:
%self.sroa.0.0..sroa_idx = getelementptr inbounds %Type, %Type* %1, i64 0, i32 0
%self.sroa.0.0.copyload = load i64, i64* %self.sroa.0.0..sroa_idx, align 8
%self.sroa.4.0..sroa_idx = getelementptr inbounds %Type, %Type* %1, i64 0, i32 2
%self.sroa.4.0..sroa_cast2 = bitcast %"collections::vec::Vec<u32>"* %self.sroa.4.0..sroa_idx to i8*
%2 = getelementptr inbounds %Type, %Type* %0, i64 0, i32 0
store i64 %self.sroa.0.0.copyload, i64* %2, align 8
%3 = getelementptr inbounds %Type, %Type* %0, i64 0, i32 4
store i32 112, i32* %3, align 8
%self.sroa.4.8..sroa_idx = getelementptr inbounds %Type, %Type* %0, i64 0, i32 2
%self.sroa.4.8..sroa_cast = bitcast %"collections::vec::Vec<u32>"* %self.sroa.4.8..sroa_idx to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %self.sroa.4.8..sroa_cast, i8* %self.sroa.4.0..sroa_cast2, i64 24, i32 8, i1 false)
ret void
}
The “functional” variant is already bloated at this point, so the problem comes form rustc itself.
That’s the first problem. The other is in the main function: look at what happens between the call to the two functions.
mov dword ptr [rax], 12
mov qword ptr [rsp + 8], 42
mov dword ptr [rsp + 40], 0
mov qword ptr [rsp + 16], rax
mov qword ptr [rsp + 24], 1
mov qword ptr [rsp + 32], 1
lea rdi, [rsp + 8]
call mut_update
mov rax, qword ptr [rsp + 40]
mov qword ptr [rsp + 80], rax
movups xmm0, xmmword ptr [rsp + 8]
movups xmm1, xmmword ptr [rsp + 24]
movaps xmmword ptr [rsp + 64], xmm1
movaps xmmword ptr [rsp + 48], xmm0
lea rdi, [rsp + 96]
lea rsi, [rsp + 48]
call func_update
Yes, you read it right: the whole memory space containing var is being copied to another place before the next function is even called. So it is copied twice: immediately before the call and immediately after.
Here is the LLVM IR for the main function.
; Function Attrs: uwtable
define internal void @_ZN4opti4main17ha95f343356fbf6f4E() unnamed_addr #2 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
%_9 = alloca %Type, align 8
%_other = alloca %Type, align 8
%var = alloca %Type, align 8
%0 = bitcast %Type* %var to i8*
call void @llvm.lifetime.start(i64 40, i8* nonnull %0)
%1 = tail call i8* @__rust_allocate(i64 4, i64 4) #4
%2 = icmp eq i8* %1, null
br i1 %2, label %bb5.i, label %bb5
bb5.i: ; preds = %start
tail call void @_ZN5alloc3oom3oom17h5b02814f1abf9784E()
unreachable
bb5: ; preds = %start
%3 = bitcast i8* %1 to i32*
store i32 12, i32* %3, align 4
%4 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 0
store i64 42, i64* %4, align 8
%5 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 4
store i32 0, i32* %5, align 8
%_2.sroa.0.0..sroa_idx = getelementptr inbounds %Type, %Type* %var, i64 0, i32 2, i32 0, i32 0, i32 0, i32 0
%6 = bitcast i32** %_2.sroa.0.0..sroa_idx to i8**
store i8* %1, i8** %6, align 8
%_2.sroa.4.0..sroa_idx11 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 2, i32 0, i32 2
store i64 1, i64* %_2.sroa.4.0..sroa_idx11, align 8
%_2.sroa.5.0..sroa_idx13 = getelementptr inbounds %Type, %Type* %var, i64 0, i32 2, i32 2
store i64 1, i64* %_2.sroa.5.0..sroa_idx13, align 8
call fastcc void @mut_update(%Type* nonnull dereferenceable(40) %var)
%7 = bitcast %Type* %_other to i8*
call void @llvm.lifetime.start(i64 40, i8* nonnull %7)
%8 = bitcast %Type* %_9 to i8*
call void @llvm.lifetime.start(i64 40, i8* nonnull %8)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %8, i8* nonnull %0, i64 40, i32 8, i1 false)
call fastcc void @func_update(%Type* noalias nocapture nonnull sret dereferenceable(40) %_other, %Type* noalias nocapture nonnull dereferenceable(40) %_9)
call void @llvm.lifetime.end(i64 40, i8* nonnull %8)
%9 = getelementptr inbounds %Type, %Type* %_other, i64 0, i32 2, i32 0, i32 2
%10 = load i64, i64* %9, align 8
%not..i.i.i.i = icmp eq i64 %10, 0
br i1 %not..i.i.i.i, label %bb6, label %bb6.i.i.i.i
bb6.i.i.i.i: ; preds = %bb5
%11 = getelementptr inbounds %Type, %Type* %_other, i64 0, i32 2
%12 = shl i64 %10, 2
%13 = bitcast %"collections::vec::Vec<u32>"* %11 to i8**
%_3.sroa.0.0.copyload3.i1.i.i.i.i = load i8*, i8** %13, align 8, !alias.scope !1
tail call void @__rust_deallocate(i8* %_3.sroa.0.0.copyload3.i1.i.i.i.i, i64 %12, i64 4) #4
br label %bb6
bb6: ; preds = %bb6.i.i.i.i, %bb5
call void @llvm.lifetime.end(i64 40, i8* nonnull %7)
call void @llvm.lifetime.end(i64 40, i8* nonnull %0)
ret void
}
I know next to nothing about LLVM IR, so I don’t really understand what is going on in there. But I guess that this line is responsible for the unnecessary copy.
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %8, i8* nonnull %0, i64 40, i32 8, i1 false)
I hope you people find a solution to this, because it is a huge loss in processor time to do almost thrice as much memory access as would be needed. And even if it is not relevant on such small a code, with a bigger code base, it must surely bloat memory usage compared to C for instance.
Inlining fixes this, but you've explicitly disabled it. Why not rely on that when code like this is on a hot path for you?
Jonas Schievink at 2017-06-14 17:54:47
Actually, no. This is the assembly generated for the relevant section when letting the inlining happen. And adding some
println!(), otherwise, the whole code is optimized into/dev/null…mov dword ptr [rax], 12 mov qword ptr [rsp + 8], 42 mov qword ptr [rsp + 16], rax mov qword ptr [rsp + 24], 1 mov qword ptr [rsp + 32], 1 mov dword ptr [rsp + 40], 79 ; This is what `mut_update` becomes. ; Begin of `println!()`. lea rax, [rsp + 8] mov qword ptr [rsp + 88], rax lea r14, [rip + _ZN47_$LT$opti..Type$u20$as$u20$core..fmt..Debug$GT$3fmt17h06d64ac62be9bfa7E] mov qword ptr [rsp + 96], r14 lea rbx, [rip + ref.9] mov qword ptr [rsp + 120], rbx mov qword ptr [rsp + 128], 2 mov qword ptr [rsp + 136], 0 lea rax, [rsp + 88] mov qword ptr [rsp + 152], rax mov qword ptr [rsp + 160], 1 .Ltmp0: lea rdi, [rsp + 120] call _ZN3std2io5stdio6_print17h4d11beeeb2d08206E@PLT ; End of `println!()`. .Ltmp1: mov rax, qword ptr [rsp + 8] mov qword ptr [rsp + 48], rax mov dword ptr [rsp + 80], 112 mov rax, qword ptr [rsp + 32] mov qword ptr [rsp + 72], rax movups xmm0, xmmword ptr [rsp + 16] movups xmmword ptr [rsp + 56], xmm0 ; Yep. Everything is still copied for no reason.Moreover, a lot of code ends up in a library, where such functions will not be inlined, so we cannot always rely on inlining to solve our problems.
DominusCarnufex at 2017-06-14 18:22:25
Interesting, can you share the exact Rust code that causes this?
Moreover, a lot of code ends up in a library, where such functions will not be inlined, so we cannot always rely on inlining to solve our problems.
#[inline]on those functions allows inlining.Regardless, the proper fix is likely to augment the MIR optimizations to eliminate more moves (in case this isn't purely an LLVM issue).
Jonas Schievink at 2017-06-14 19:25:40
Also, not producing code (instead of relying on successive removal/optimization of it by LLVM) is what will make Rustc faster.
leonardo-m at 2017-06-14 19:30:35
@jonas-schievink : here is the full code of the inlined version.
#[derive(Debug)] struct Type { field1 : u64, field2 : u32, field3 : Vec<u32>, } impl Type { #[no_mangle] //#[inline(never)] fn mut_update(&mut self, field2 : u32) -> &mut Self { self.field2 = field2; self } #[no_mangle] //#[inline(never)] fn func_update(self, field2 : u32) -> Self { Type { field2 : field2, .. self } } } fn main() { let mut var = Type { field1 : 42, field2 : 0, field3 : vec![12] }; let _ = var.mut_update(79); println!("{:?}", var); let _other = var.func_update(112); println!("{:?}", _other); }As you can see, almost nothing changed compared to the first version.
DominusCarnufex at 2017-06-15 16:30:00
I have been trying to understand better what is going on, so I reduced the code as much as possible.
#![crate_type = "rlib"] struct Type { field1 : u64, field2 : u32, } impl Type { fn mut_update(&mut self, field2 : u32) -> &mut Self { self.field2 = field2; self } fn func_update(self, field2 : u32) -> Self { Type { field2 : field2, .. self } } }And when I use
rustc --emit mir, here is what I get.fn <impl at opti.rs:10:1: 23:2>::mut_update(_1: &mut Type, _2: u32) -> &mut Type { let mut _0: &mut Type; // return pointer scope 1 { let _3: &mut Type; // "self" in scope 1 at opti.rs:13:19: 13:28 let _4: u32; // "field2" in scope 1 at opti.rs:13:30: 13:36 } let mut _5: &mut Type; let mut _6: u32; bb0: { StorageLive(_3); // scope 0 at opti.rs:13:19: 13:28 _3 = _1; // scope 0 at opti.rs:13:19: 13:28 StorageLive(_4); // scope 0 at opti.rs:13:30: 13:36 _4 = _2; // scope 0 at opti.rs:13:30: 13:36 StorageLive(_5); // scope 1 at opti.rs:13:57: 16:6 StorageLive(_6); // scope 1 at opti.rs:14:23: 14:29 _6 = _4; // scope 1 at opti.rs:14:23: 14:29 ((*_3).1: u32) = _6; // scope 1 at opti.rs:14:9: 14:29 StorageDead(_6); // scope 1 at opti.rs:14:29: 14:29 _5 = _3; // scope 1 at opti.rs:15:9: 15:13 _0 = _5; // scope 1 at opti.rs:13:57: 16:6 StorageDead(_5); // scope 1 at opti.rs:16:6: 16:6 StorageDead(_4); // scope 0 at opti.rs:16:6: 16:6 StorageDead(_3); // scope 0 at opti.rs:16:6: 16:6 return; // scope 1 at opti.rs:16:6: 16:6 } } fn <impl at opti.rs:10:1: 23:2>::func_update(_1: Type, _2: u32) -> Type { let mut _0: Type; // return pointer scope 1 { let _3: Type; // "self" in scope 1 at opti.rs:20:20: 20:24 let _4: u32; // "field2" in scope 1 at opti.rs:20:26: 20:32 } let mut _5: u32; bb0: { StorageLive(_3); // scope 0 at opti.rs:20:20: 20:24 _3 = _1; // scope 0 at opti.rs:20:20: 20:24 StorageLive(_4); // scope 0 at opti.rs:20:26: 20:32 _4 = _2; // scope 0 at opti.rs:20:26: 20:32 StorageLive(_5); // scope 1 at opti.rs:21:25: 21:31 _5 = _4; // scope 1 at opti.rs:21:25: 21:31 _0 = Type { field1: (_3.0: u64), field2: _5 }; // scope 1 at opti.rs:21:9: 21:42 StorageDead(_5); // scope 1 at opti.rs:21:42: 21:42 StorageDead(_4); // scope 0 at opti.rs:22:6: 22:6 StorageDead(_3); // scope 0 at opti.rs:22:6: 22:6 return; // scope 1 at opti.rs:22:6: 22:6 } }The first thing that strikes me is: why is the code so bloated? If I understand well what happens, this code should be exactly equivalent.
fn <impl at opti.rs:10:1: 23:2>::mut_update(_1: &mut Type, _2: u32) -> &mut Type { let mut _0: &mut Type; // return pointer bb0: { ((*_1).1: u32) = _2; // scope 1 at opti.rs:14:9: 14:29 _0 = _1; // scope 1 at opti.rs:13:57: 16:6 return; // scope 1 at opti.rs:16:6: 16:6 } } fn <impl at opti.rs:10:1: 23:2>::func_update(_1: Type, _2: u32) -> Type { let mut _0: Type; // return pointer bb0: { _0 = Type { field1: (_1.0: u64), field2: _2 }; // scope 1 at opti.rs:21:9: 21:42 return; // scope 1 at opti.rs:22:6: 22:6 } }So if someone who knows Rust internals well comes here: is the full code useful in some way, and if yes, which one? Or should we try to make it a lot simpler?
Second thing, the simplified code makes it possible to imagine a way to solve my original problem, that is the useless data copying.
- If the argument
argis owned by the function. - If a new variable
varis created withargas basis. - If no read or write is performed on
argposteriorly to the creation ofvar. - Then, replace the creation of
varby a reuse ofarg, possibly modified.
This should solve the problem without introducing any new one.
Now, I am willing to try and create this optimization, but I need help because I have no idea where in the code I should introduce it.
DominusCarnufex at 2017-07-04 17:08:39
- If the argument
-Z mir-opt-level=3gives the following. Seemsfunc_updateis what you expected now. I think most of this will be due to the copy propagation pass.fn <impl at opti.rs:8:1: 17:2>::mut_update(_1: &mut Type, _2: u32) -> &mut Type { let mut _0: &mut Type; // return pointer scope 1 { let _3: &mut Type; // "self" in scope 1 at opti.rs:9:19: 9:28 } let mut _4: &mut Type; bb0: { StorageLive(_3); // scope 0 at opti.rs:9:19: 9:28 _3 = _1; // scope 0 at opti.rs:9:19: 9:28 nop; // scope 0 at opti.rs:9:30: 9:36 nop; // scope 0 at opti.rs:9:30: 9:36 StorageLive(_4); // scope 1 at opti.rs:9:57: 12:6 nop; // scope 1 at opti.rs:10:23: 10:29 nop; // scope 1 at opti.rs:10:23: 10:29 ((*_3).1: u32) = _2; // scope 1 at opti.rs:10:9: 10:29 nop; // scope 1 at opti.rs:10:29: 10:29 _4 = _3; // scope 1 at opti.rs:11:9: 11:13 _0 = _4; // scope 1 at opti.rs:9:57: 12:6 StorageDead(_4); // scope 1 at opti.rs:12:6: 12:6 nop; // scope 0 at opti.rs:12:6: 12:6 StorageDead(_3); // scope 0 at opti.rs:12:6: 12:6 return; // scope 1 at opti.rs:12:6: 12:6 } } fn <impl at opti.rs:8:1: 17:2>::func_update(_1: Type, _2: u32) -> Type { let mut _0: Type; // return pointer scope 1 { } bb0: { nop; // scope 0 at opti.rs:14:20: 14:24 nop; // scope 0 at opti.rs:14:20: 14:24 nop; // scope 0 at opti.rs:14:26: 14:32 nop; // scope 0 at opti.rs:14:26: 14:32 nop; // scope 1 at opti.rs:15:25: 15:31 nop; // scope 1 at opti.rs:15:25: 15:31 (_0.0: u64) = (_1.0: u64); // scope 1 at opti.rs:15:9: 15:42 (_0.1: u32) = _2; // scope 1 at opti.rs:15:9: 15:42 nop; // scope 1 at opti.rs:15:42: 15:42 nop; // scope 0 at opti.rs:16:6: 16:6 nop; // scope 0 at opti.rs:16:6: 16:6 return; // scope 1 at opti.rs:16:6: 16:6 } }Philip Craig at 2017-07-13 06:44:30
Small update, here's the code on godbolt: https://godbolt.org/z/BR-uGo
Looks like you still get the same asm today.
Steve Klabnik at 2019-04-05 21:24:13
This looks non-actionable to me. All those copies look necessary from a black-box perspective. The one at the caller could be elided making use of the fact that the argument is actually readonly (and nocapture), but that's a property of the implementation, not of the ABI.
Nikita Popov at 2021-03-13 20:16:50