Mutable references as function arguments generate inefficient code in trivial cases

2b719cd
Opened by mahkoh at 2023-04-05 17:38:40
#![crate_type = "lib"]

#[no_mangle]
pub fn good1(x: &mut bool) {
    *x = *x;
}

#[no_mangle]
pub fn good2(x: &mut bool) {
    if *x {
        *x = *x;
    } else {
        *x = *x;
    }
}

#[no_mangle]
pub fn good3(x: &mut bool) {
    let mut y = *x;

    if y {
        y = y;
    }

    *x = y;
}

#[no_mangle]
pub fn good4(x: &mut bool) {
    let mut y = *x;

    if y {
        y = false;
        y = true;
    }

    *x = y;
}

#[no_mangle]
pub fn bad1(x: &mut bool) {
    if *x {
        *x = *x;
    }
}

#[no_mangle]
pub fn bad2(x: &mut bool) {
    if *x {
        *x = false;
        *x = true;
    }
}

#[no_mangle]
pub fn bad3(x: &mut bool) {
    if !*x {
        *x = *x;
    }
}
good1:
    retq

good2:
    retq

good3:
    retq

good4:
    retq

bad1:
    cmpb    $0, (%rdi)
    je  .LBB4_2
    movb    $1, (%rdi)
.LBB4_2:
    retq

bad2:
    cmpb    $0, (%rdi)
    je  .LBB5_2
    movb    $1, (%rdi)
.LBB5_2:
    retq

bad3:
    cmpb    $0, (%rdi)
    jne .LBB6_2
    movb    $0, (%rdi)
.LBB6_2:
    retq
  1. Looks like an LLVM problem. Does clang generate better code?

    Ariel Ben-Yehuda at 2015-11-25 14:10:40

  2. Triage: this is still an issue. From what I can tell, only bad1 and bad2 are still a problem in Rust, with bad3 now being solved. I may have incorrectly translated the code above into C (C++?), though, so no guarantees...

    Interestingly, bad3 with rustc -O compiles to a retq, but clang 4.0.0 doesn't optimize it away.

    However, for bad1 and bad2 the assembly looks the same:

    void bad1(bool *x) {
        if (*x) {
            *x = *x;
        }
    }
    
    void bad2(bool *x) {
        if (*x) {
            *x = false;
            *x = true;
        }
    }
    
    void bad3(bool *x) {
        if (!*x) {
            *x = *x;
        }
    }
    

    compiles to:

    bad1(bool*):                              # @bad1(bool*)
            cmpb    $0, (%rdi)
            je      .LBB0_2
            movb    $1, (%rdi)
    .LBB0_2:
            retq
    
    bad2(bool*):                              # @bad2(bool*)
            cmpb    $0, (%rdi)
            je      .LBB1_2
            movb    $1, (%rdi)
    .LBB1_2:
            retq
    
    bad3(bool*):                              # @bad3(bool*)
            cmpb    $0, (%rdi)
            jne     .LBB2_2
            movb    $0, (%rdi)
    .LBB2_2:
            retq
    

    while Rust compiles the original code to:

    good1:
        retq
    
    good2:
        retq
    
    good3:
        retq
    
    good4:
        retq
    
    bad1:
        cmpb   $0x0,(%rdi)
        je     8 <bad1+0x8>
        movb   $0x1,(%rdi)
        retq
    
    bad2:
        cmpb   $0x0,(%rdi)
        je     8 <bad2+0x8>
        movb   $0x1,(%rdi)
        retq
    
    bad3:
        retq
    

    Mark Rousskov at 2017-04-15 16:33:19

  3. Seems like this regressed even more: https://godbolt.org/g/n3cxcD

    Am I misinterpreting godbolt's output? Why are none of the good* functions just retq anymore?

    Cc @rust-lang/wg-codegen

    Anthony Ramine at 2018-04-02 14:41:24

  4. Godbolt requests debug info, which in turn disables frame-pointer elimination. The push/mov/pop instructions are the frame-pointer bookkeeping.

    On Mon, Apr 2, 2018, 17:41 Anthony Ramine notifications@github.com wrote:

    Seems like this regressed even more: https://godbolt.org/g/n3cxcD

    Am I misinterpreting godbolt's output? Why are none of the good* functions just retq anymore?

    Cc @rust-lang/wg-codegen https://github.com/orgs/rust-lang/teams/wg-codegen

    — You are receiving this because you are on a team that was mentioned. Reply to this email directly, view it on GitHub https://github.com/rust-lang/rust/issues/30041#issuecomment-377944268, or mute the thread https://github.com/notifications/unsubscribe-auth/AApc0ue4XYfP0l4Ae1JF-ESAERAUOZM4ks5tkji1gaJpZM4GoxxM .

    Simonas Kazlauskas at 2018-04-03 09:23:50

  5. @nagisa Thanks! Would have never guessed that myself. What is the flag to omit debug info on Godbolt?

    Anthony Ramine at 2018-04-03 09:40:21

  6. You cannot disable debug-info generation for godbolt because that’s what it relies on to provide its basic functionality.

    That being said, this should help with the frame-pointer stuff once it lands.

    Simonas Kazlauskas at 2018-04-03 10:05:31

  7. A Godbolt with more comparative data: https://rust.godbolt.org/z/cs69hPnT6 As of tonight, the "good" functions no longer have problems, and bad3 is fixed. bad1 and bad2 now merge into a single function because they generate identical code.

    bad1:
            cmp     byte ptr [rdi], 0
            je      .LBB1_2
            mov     byte ptr [rdi], 1
    .LBB1_2:
            ret
    

    This seems to be true of both rustc and clang.

    The unoptimized LLVMIR currently looks like

    define void @bad1(i8* align 1 %x) unnamed_addr #0 {
      %0 = load i8, i8* %x, align 1, !range !2, !noundef !3
      %_2 = trunc i8 %0 to i1
      br i1 %_2, label %bb1, label %bb2
    
    bb2:                                              ; preds = %bb1, %start
      ret void
    
    bb1:                                              ; preds = %start
      %1 = load i8, i8* %x, align 1, !range !2, !noundef !3
      %_3 = trunc i8 %1 to i1
      %2 = zext i1 %_3 to i8
      store i8 %2, i8* %x, align 1
      br label %bb2
    }
    
    define void @bad2(i8* align 1 %x) unnamed_addr #0 {
      %0 = load i8, i8* %x, align 1, !range !2, !noundef !3
      %_2 = trunc i8 %0 to i1
      br i1 %_2, label %bb1, label %bb2
    
    bb2:                                              ; preds = %bb1, %start
      ret void
    
    bb1:                                              ; preds = %start
      store i8 0, i8* %x, align 1
      store i8 1, i8* %x, align 1
      br label %bb2
    }
    
    define void @bad3(i8* align 1 %x) unnamed_addr #0 {
      %0 = load i8, i8* %x, align 1, !range !2, !noundef !3
      %_3 = trunc i8 %0 to i1
      %_2 = xor i1 %_3, true
      br i1 %_2, label %bb1, label %bb2
    
    bb2:                                              ; preds = %bb1, %start
      ret void
    
    bb1:                                              ; preds = %start
      %1 = load i8, i8* %x, align 1, !range !2, !noundef !3
      %_4 = trunc i8 %1 to i1
      %2 = zext i1 %_4 to i8
      store i8 %2, i8* %x, align 1
      br label %bb2
    }
    
    attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
    

    It might be useful to figure out why bad3 and most of the "good" functions — despite generating even heavier unoptimized IR — still get optimized completely away. good1 seems to simply be tiny enough that it is trivial to optimize.

    Jubilee at 2022-07-11 03:44:27