Identity Result match mapping should optimize to an identity function
This was isolated by @stjepang in #37939
This program: (playground)
pub fn id_result(a: Result<u64, i64>) -> Result<u64, i64> {
match a {
Ok(x) => Ok(x),
Err(y) => Err(y),
}
}
Should optimize to something that just copies the input to the output, with no conditionals.
Output on rustc 1.15.0-nightly (daf8c1dfc 2016-12-05) using -Copt-level=3 --emit=llvm-ir is this:
; ModuleID = 'id_result.cgu-0.rs'
source_filename = "id_result.cgu-0.rs"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
%"core::result::Result<u64, i64>" = type { i64, [0 x i64], [1 x i64] }
; Function Attrs: norecurse nounwind uwtable
define void @id_result(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture readonly dereferenceable(16)) unnamed_addr #0 {
entry-block:
%a.sroa.0.0..sroa_idx = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %1, i64 0, i32 0
%a.sroa.0.0.copyload = load i64, i64* %a.sroa.0.0..sroa_idx, align 8
%a.sroa.4.0..sroa_idx2 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %1, i64 0, i32 2, i64 0
%a.sroa.4.0.copyload = load i64, i64* %a.sroa.4.0..sroa_idx2, align 8
%2 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %0, i64 0, i32 0
%not.switch = icmp ne i64 %a.sroa.0.0.copyload, 0
%. = zext i1 %not.switch to i64
store i64 %., i64* %2, align 8
%3 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %0, i64 0, i32 2, i64 0
store i64 %a.sroa.4.0.copyload, i64* %3, align 8
ret void
}
attributes #0 = { norecurse nounwind uwtable }
Just leaving a note: This fix in LLVM will probably resolve the issue.
Deleted user at 2016-12-15 18:45:07
copy propagation in MIR for function arguments (added in #38332) can cause this to optimize correctly, at least for
Result<u64, i64>(and not forResult<u32, i32>). This optimization is never used by default though.the diff in unoptimized IR just points to the range metadata ending up somewhere it's not removed.
bluss at 2016-12-16 22:47:44
So according to the issue @stjepang notes, it looks like this may have been merged into LLVM trunk on Mar 22 2017; in https://reviews.llvm.org/rL298540. It's possible we could backport this? I'm not sure, but this could be https://github.com/rust-lang/rust/pull/40914.
Mark Rousskov at 2017-05-18 02:08:11
What I can see, this is still an issue in rustc 1.21.0-nightly (b75d1f0ce 2017-08-02)
bluss at 2017-08-04 22:18:08
Fixed by #45380?
Diff of unoptimized IRs (1.21 stable -> nightly):
; test::id_result ; Function Attrs: uwtable -define void @_ZN4test9id_result17h0e83cb02e8136307E(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture dereferenceable(16)) unnamed_addr #0 { +define void @_ZN4test9id_result17ha0e8372a885d01bfE(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture dereferenceable(16) %a) unnamed_addr #0 { start: - %a = alloca %"core::result::Result<u64, i64>" - %2 = bitcast %"core::result::Result<u64, i64>"* %1 to i8* - %3 = bitcast %"core::result::Result<u64, i64>"* %a to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* %2, i64 16, i32 8, i1 false) - %4 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %a, i32 0, i32 0 - %5 = load i64, i64* %4, !range !0 - switch i64 %5, label %bb2 [ + %1 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %a, i32 0, i32 0 + %2 = load i64, i64* %1, !range !0 + switch i64 %2, label %bb2 [ i64 0, label %bb1 ] (snip)Optimized IR on nightly (1.23.0-nightly (45caff88d 2017-11-11)):
; test::id_result ; Function Attrs: norecurse nounwind uwtable define void @_ZN4test9id_result17ha0e8372a885d01bfE(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture readonly dereferenceable(16) %a) unnamed_addr #0 { start: %1 = bitcast %"core::result::Result<u64, i64>"* %a to <2 x i64>* %2 = load <2 x i64>, <2 x i64>* %1, align 8 %3 = bitcast %"core::result::Result<u64, i64>"* %0 to <2 x i64>* store <2 x i64> %2, <2 x i64>* %3, align 8 ret void }Shotaro Yamada at 2017-11-12 13:50:33
Yep, that's fixed in the original code this bug was filed for. The issue persists if we tweak it just a little (32-bit instead of 64-bit payload):
#![crate_type="lib"] type T = i32; #[no_mangle] pub fn id_result(a: Result<T, T>) -> Result<T, T> { match a { Ok(x) => Ok(x), Err(y) => Err(y), } }bluss at 2017-11-12 14:37:27
@Mark-Simulacrum Is that patch only for
nonnull, notrangemetadata? Looks like this issue is still a problem, it starts with!rangemetadata, but there's stillicmpleft around.Eduard-Mihai Burtescu at 2018-03-28 09:38:43
I don't know.
Mark Rousskov at 2018-03-28 11:03:39
Hmm. Interesting, the second case now becomes
Rust MIR:
fn id_result(_1: Result<i32, i32>) -> Result<i32, i32> { debug a => _1; // in scope 0 at src/lib.rs:6:18: 6:19 let mut _0: std::result::Result<i32, i32>; // return place in scope 0 at src/lib.rs:6:38: 6:50 let mut _2: isize; // in scope 0 at src/lib.rs:8:9: 8:14 let _3: i32; // in scope 0 at src/lib.rs:8:12: 8:13 let mut _4: i32; // in scope 0 at src/lib.rs:8:21: 8:22 let _5: i32; // in scope 0 at src/lib.rs:9:13: 9:14 let mut _6: i32; // in scope 0 at src/lib.rs:9:23: 9:24 scope 1 { debug x => _3; // in scope 1 at src/lib.rs:8:12: 8:13 } scope 2 { debug y => _5; // in scope 2 at src/lib.rs:9:13: 9:14 } bb0: { _2 = discriminant(_1); // scope 0 at src/lib.rs:7:11: 7:12 switchInt(move _2) -> [0_isize: bb3, 1_isize: bb1, otherwise: bb2]; // scope 0 at src/lib.rs:7:5: 7:12 } bb1: { StorageLive(_5); // scope 0 at src/lib.rs:9:13: 9:14 _5 = ((_1 as Err).0: i32); // scope 0 at src/lib.rs:9:13: 9:14 StorageLive(_6); // scope 2 at src/lib.rs:9:23: 9:24 _6 = _5; // scope 2 at src/lib.rs:9:23: 9:24 Deinit(_0); // scope 2 at src/lib.rs:9:19: 9:25 ((_0 as Err).0: i32) = move _6; // scope 2 at src/lib.rs:9:19: 9:25 discriminant(_0) = 1; // scope 2 at src/lib.rs:9:19: 9:25 StorageDead(_6); // scope 2 at src/lib.rs:9:24: 9:25 StorageDead(_5); // scope 0 at src/lib.rs:9:24: 9:25 goto -> bb4; // scope 0 at src/lib.rs:9:24: 9:25 } bb2: { unreachable; // scope 0 at src/lib.rs:7:11: 7:12 } bb3: { StorageLive(_3); // scope 0 at src/lib.rs:8:12: 8:13 _3 = ((_1 as Ok).0: i32); // scope 0 at src/lib.rs:8:12: 8:13 StorageLive(_4); // scope 1 at src/lib.rs:8:21: 8:22 _4 = _3; // scope 1 at src/lib.rs:8:21: 8:22 Deinit(_0); // scope 1 at src/lib.rs:8:18: 8:23 ((_0 as Ok).0: i32) = move _4; // scope 1 at src/lib.rs:8:18: 8:23 discriminant(_0) = 0; // scope 1 at src/lib.rs:8:18: 8:23 StorageDead(_4); // scope 1 at src/lib.rs:8:22: 8:23 StorageDead(_3); // scope 0 at src/lib.rs:8:22: 8:23 goto -> bb4; // scope 0 at src/lib.rs:8:22: 8:23 } bb4: { return; // scope 0 at src/lib.rs:11:2: 11:2 } }LLVMIR:
; ModuleID = 'playground.a04b7f1e-cgu.0' source_filename = "playground.a04b7f1e-cgu.0" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: mustprogress nofree norecurse nosync nounwind nonlazybind readnone uwtable willreturn define { i32, i32 } @id_result(i32 noundef %0, i32 %1) unnamed_addr #0 { start: %switch = icmp ne i32 %0, 0 %. = zext i1 %switch to i32 %2 = insertvalue { i32, i32 } undef, i32 %., 0 %3 = insertvalue { i32, i32 } %2, i32 %1, 1 ret { i32, i32 } %3 } attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind readnone uwtable willreturn "probe-stack"="__rust_probestack" "target-cpu"="x86-64" } !llvm.module.flags = !{!0, !1} !0 = !{i32 7, !"PIC Level", i32 2} !1 = !{i32 2, !"RtLibUseGOT", i32 1}x86 assembly:
id_result: # @id_result # %bb.0: movl %esi, %edx xorl %eax, %eax testl %edi, %edi setne %al retq # -- End functionThe MIR emitted is identical, however, so it has to be regarding how we are handling Results of particular sizes or shapes during lowering of them, somewhere in cg_ssa or cg_llvm.
Jubilee at 2022-07-06 02:13:17
There is no
testqandsetnein case theOkvariant is anu64and theErrvariant ani64and in case it is the other way around, however there is a load and store asResultis passed by reference. If both are anu64or bothi64, theResultis passed by value in two registers and it fails to optimize thetestq/setneaway.bjorn3 at 2022-07-06 11:16:01