Identity Result match mapping should optimize to an identity function

5761e60
Opened by bluss at 2023-10-08 20:50:24

This was isolated by @stjepang in #37939

This program: (playground)

pub fn id_result(a: Result<u64, i64>) -> Result<u64, i64> {
    match a {
        Ok(x) => Ok(x),
        Err(y) => Err(y),
    }
}

Should optimize to something that just copies the input to the output, with no conditionals.

Output on rustc 1.15.0-nightly (daf8c1dfc 2016-12-05) using -Copt-level=3 --emit=llvm-ir is this:

; ModuleID = 'id_result.cgu-0.rs'
source_filename = "id_result.cgu-0.rs"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"core::result::Result<u64, i64>" = type { i64, [0 x i64], [1 x i64] }

; Function Attrs: norecurse nounwind uwtable
define void @id_result(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture readonly dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %a.sroa.0.0..sroa_idx = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %1, i64 0, i32 0
  %a.sroa.0.0.copyload = load i64, i64* %a.sroa.0.0..sroa_idx, align 8
  %a.sroa.4.0..sroa_idx2 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %1, i64 0, i32 2, i64 0
  %a.sroa.4.0.copyload = load i64, i64* %a.sroa.4.0..sroa_idx2, align 8
  %2 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %0, i64 0, i32 0
  %not.switch = icmp ne i64 %a.sroa.0.0.copyload, 0
  %. = zext i1 %not.switch to i64
  store i64 %., i64* %2, align 8
  %3 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %0, i64 0, i32 2, i64 0
  store i64 %a.sroa.4.0.copyload, i64* %3, align 8
  ret void
}

attributes #0 = { norecurse nounwind uwtable }
  1. Just leaving a note: This fix in LLVM will probably resolve the issue.

    Deleted user at 2016-12-15 18:45:07

  2. copy propagation in MIR for function arguments (added in #38332) can cause this to optimize correctly, at least for Result<u64, i64> (and not for Result<u32, i32>). This optimization is never used by default though.

    the diff in unoptimized IR just points to the range metadata ending up somewhere it's not removed.

    bluss at 2016-12-16 22:47:44

  3. So according to the issue @stjepang notes, it looks like this may have been merged into LLVM trunk on Mar 22 2017; in https://reviews.llvm.org/rL298540. It's possible we could backport this? I'm not sure, but this could be https://github.com/rust-lang/rust/pull/40914.

    Mark Rousskov at 2017-05-18 02:08:11

  4. What I can see, this is still an issue in rustc 1.21.0-nightly (b75d1f0ce 2017-08-02)

    bluss at 2017-08-04 22:18:08

  5. Fixed by #45380?

    Diff of unoptimized IRs (1.21 stable -> nightly):

     ; test::id_result
     ; Function Attrs: uwtable
    -define void @_ZN4test9id_result17h0e83cb02e8136307E(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture dereferenceable(16)) unnamed_addr #0 {
    +define void @_ZN4test9id_result17ha0e8372a885d01bfE(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture dereferenceable(16) %a) unnamed_addr #0 {
     start:
    -  %a = alloca %"core::result::Result<u64, i64>"
    -  %2 = bitcast %"core::result::Result<u64, i64>"* %1 to i8*
    -  %3 = bitcast %"core::result::Result<u64, i64>"* %a to i8*
    -  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* %2, i64 16, i32 8, i1 false)
    -  %4 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %a, i32 0, i32 0
    -  %5 = load i64, i64* %4, !range !0
    -  switch i64 %5, label %bb2 [
    +  %1 = getelementptr inbounds %"core::result::Result<u64, i64>", %"core::result::Result<u64, i64>"* %a, i32 0, i32 0
    +  %2 = load i64, i64* %1, !range !0
    +  switch i64 %2, label %bb2 [
         i64 0, label %bb1
       ]
    (snip)
    

    Optimized IR on nightly (1.23.0-nightly (45caff88d 2017-11-11)):

    ; test::id_result
    ; Function Attrs: norecurse nounwind uwtable
    define void @_ZN4test9id_result17ha0e8372a885d01bfE(%"core::result::Result<u64, i64>"* noalias nocapture sret dereferenceable(16), %"core::result::Result<u64, i64>"* noalias nocapture readonly dereferenceable(16) %a) unnamed_addr #0 {
    start:
      %1 = bitcast %"core::result::Result<u64, i64>"* %a to <2 x i64>*
      %2 = load <2 x i64>, <2 x i64>* %1, align 8
      %3 = bitcast %"core::result::Result<u64, i64>"* %0 to <2 x i64>*
      store <2 x i64> %2, <2 x i64>* %3, align 8
      ret void
    }
    

    Shotaro Yamada at 2017-11-12 13:50:33

  6. Yep, that's fixed in the original code this bug was filed for. The issue persists if we tweak it just a little (32-bit instead of 64-bit payload):

    #![crate_type="lib"]
    
    type T = i32;
    
    #[no_mangle]
    pub fn id_result(a: Result<T, T>) -> Result<T, T> {
        match a {
            Ok(x) => Ok(x),
            Err(y) => Err(y),
        }
    }
    

    bluss at 2017-11-12 14:37:27

  7. @Mark-Simulacrum Is that patch only for nonnull, not range metadata? Looks like this issue is still a problem, it starts with !range metadata, but there's still icmp left around.

    Eduard-Mihai Burtescu at 2018-03-28 09:38:43

  8. I don't know.

    Mark Rousskov at 2018-03-28 11:03:39

  9. Hmm. Interesting, the second case now becomes

    Rust MIR:

    fn id_result(_1: Result<i32, i32>) -> Result<i32, i32> {
        debug a => _1;                       // in scope 0 at src/lib.rs:6:18: 6:19
        let mut _0: std::result::Result<i32, i32>; // return place in scope 0 at src/lib.rs:6:38: 6:50
        let mut _2: isize;                   // in scope 0 at src/lib.rs:8:9: 8:14
        let _3: i32;                         // in scope 0 at src/lib.rs:8:12: 8:13
        let mut _4: i32;                     // in scope 0 at src/lib.rs:8:21: 8:22
        let _5: i32;                         // in scope 0 at src/lib.rs:9:13: 9:14
        let mut _6: i32;                     // in scope 0 at src/lib.rs:9:23: 9:24
        scope 1 {
            debug x => _3;                   // in scope 1 at src/lib.rs:8:12: 8:13
        }
        scope 2 {
            debug y => _5;                   // in scope 2 at src/lib.rs:9:13: 9:14
        }
    
        bb0: {
            _2 = discriminant(_1);           // scope 0 at src/lib.rs:7:11: 7:12
            switchInt(move _2) -> [0_isize: bb3, 1_isize: bb1, otherwise: bb2]; // scope 0 at src/lib.rs:7:5: 7:12
        }
    
        bb1: {
            StorageLive(_5);                 // scope 0 at src/lib.rs:9:13: 9:14
            _5 = ((_1 as Err).0: i32);       // scope 0 at src/lib.rs:9:13: 9:14
            StorageLive(_6);                 // scope 2 at src/lib.rs:9:23: 9:24
            _6 = _5;                         // scope 2 at src/lib.rs:9:23: 9:24
            Deinit(_0);                      // scope 2 at src/lib.rs:9:19: 9:25
            ((_0 as Err).0: i32) = move _6;  // scope 2 at src/lib.rs:9:19: 9:25
            discriminant(_0) = 1;            // scope 2 at src/lib.rs:9:19: 9:25
            StorageDead(_6);                 // scope 2 at src/lib.rs:9:24: 9:25
            StorageDead(_5);                 // scope 0 at src/lib.rs:9:24: 9:25
            goto -> bb4;                     // scope 0 at src/lib.rs:9:24: 9:25
        }
    
        bb2: {
            unreachable;                     // scope 0 at src/lib.rs:7:11: 7:12
        }
    
        bb3: {
            StorageLive(_3);                 // scope 0 at src/lib.rs:8:12: 8:13
            _3 = ((_1 as Ok).0: i32);        // scope 0 at src/lib.rs:8:12: 8:13
            StorageLive(_4);                 // scope 1 at src/lib.rs:8:21: 8:22
            _4 = _3;                         // scope 1 at src/lib.rs:8:21: 8:22
            Deinit(_0);                      // scope 1 at src/lib.rs:8:18: 8:23
            ((_0 as Ok).0: i32) = move _4;   // scope 1 at src/lib.rs:8:18: 8:23
            discriminant(_0) = 0;            // scope 1 at src/lib.rs:8:18: 8:23
            StorageDead(_4);                 // scope 1 at src/lib.rs:8:22: 8:23
            StorageDead(_3);                 // scope 0 at src/lib.rs:8:22: 8:23
            goto -> bb4;                     // scope 0 at src/lib.rs:8:22: 8:23
        }
    
        bb4: {
            return;                          // scope 0 at src/lib.rs:11:2: 11:2
        }
    }
    

    LLVMIR:

    ; ModuleID = 'playground.a04b7f1e-cgu.0'
    source_filename = "playground.a04b7f1e-cgu.0"
    target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
    target triple = "x86_64-unknown-linux-gnu"
    
    ; Function Attrs: mustprogress nofree norecurse nosync nounwind nonlazybind readnone uwtable willreturn
    define { i32, i32 } @id_result(i32 noundef %0, i32 %1) unnamed_addr #0 {
    start:
      %switch = icmp ne i32 %0, 0
      %. = zext i1 %switch to i32
      %2 = insertvalue { i32, i32 } undef, i32 %., 0
      %3 = insertvalue { i32, i32 } %2, i32 %1, 1
      ret { i32, i32 } %3
    }
    
    attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind readnone uwtable willreturn "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
    
    !llvm.module.flags = !{!0, !1}
    
    !0 = !{i32 7, !"PIC Level", i32 2}
    !1 = !{i32 2, !"RtLibUseGOT", i32 1}
    

    x86 assembly:

    
    id_result:                              # @id_result
    # %bb.0:
    	movl	%esi, %edx
    	xorl	%eax, %eax
    	testl	%edi, %edi
    	setne	%al
    	retq
                                            # -- End function
    

    The MIR emitted is identical, however, so it has to be regarding how we are handling Results of particular sizes or shapes during lowering of them, somewhere in cg_ssa or cg_llvm.

    Jubilee at 2022-07-06 02:13:17

  10. There is no testq and setne in case the Ok variant is an u64 and the Err variant an i64 and in case it is the other way around, however there is a load and store as Result is passed by reference. If both are an u64 or both i64, the Result is passed by value in two registers and it fails to optimize the testq/setne away.

    bjorn3 at 2022-07-06 11:16:01