Poor performance returning enums larged than a word. Possibly poor code generation?

dea6d3c
Opened by Erick Tryzelaar at 2025-03-01 11:34:37

I've discovered an issue with IoError, and really returning any enums that are larger than 1 word, are running an order of magnitude slower than returning an error enum that's 1 word size. Here's my test case:

extern crate test;

use std::mem;
use std::vec;
use std::io;

const BUFFER_SIZE: uint = 128;

//////////////////////////////////////////////////////////////////////////////

trait Error {
    fn is_eof(&self) -> bool;
}

impl Error for io::IoError {
    fn is_eof(&self) -> bool {
        self.kind == io::EndOfFile
    }
}

#[deriving(Show, PartialEq, Eq)]
enum MyError {
    EndOfFile,
    Error,
    _Error1,
}

impl Error for MyError {
    fn is_eof(&self) -> bool {
        *self == MyError::EndOfFile
    }
}

#[deriving(Show, PartialEq, Eq)]
enum MyError2 {
    EndOfFile,
    Error,
    _Error1(uint),
}

impl Error for MyError2 {
    fn is_eof(&self) -> bool {
        *self == MyError2::EndOfFile
    }
}

impl Error for () {
    fn is_eof(&self) -> bool {
        true
    }
}

impl Error for Box<MyError> {
    fn is_eof(&self) -> bool {
        **self == MyError::EndOfFile
    }
}

//////////////////////////////////////////////////////////////////////////////

fn generate_bytes() -> Vec<u8> {
    let mut bytes = Vec::new();

    for i in range(0i, 1024) {
        bytes.push(i as u8);
    }

    bytes
}

//////////////////////////////////////////////////////////////////////////////

struct Foo11<'a, E> {
    iter: vec::MoveItems<u8>,
    f: |&mut Vec<u8>|: 'a -> Result<(), E>,
}

impl<'a, E: Error> Foo11<'a, E> {
    fn new<'a>(f: |&mut Vec<u8>|: 'a -> Result<(), E>) -> Foo11<'a, E> {
        let buf = Vec::with_capacity(BUFFER_SIZE);

        Foo11 {
            iter: buf.into_iter(),
            f: f,
        }
    }

    fn fill_buf(&mut self) -> Result<bool, E> {
        let mut iter = Vec::new().into_iter();
        mem::swap(&mut iter, &mut self.iter);
        let mut buf = iter.into_inner();
        buf.clear();

        try!((self.f)(&mut buf));

        if buf.is_empty() {
            Ok(false)
        } else {
            self.iter = buf.into_iter();
            Ok(true)
        }
    }
}

impl<'a, E: Error> Iterator<Result<u8, E>> for Foo11<'a, E> {
    fn next(&mut self) -> Option<Result<u8, E>> {
        loop {
            match self.iter.next() {
                Some(value) => { return Some(Ok(value)); }
                None => {
                    match self.fill_buf() {
                        Ok(false) => { return None; }
                        Ok(true) => { }
                        Err(err) => { return Some(Err(err)); }
                    }
                }
            }
        }
    }
}

#[bench]
fn bench_foo11_ioerror(b: &mut test::Bencher) {
    let bytes = generate_bytes();
    b.bytes = bytes.len() as u64;

    b.iter(|| {
        let mut rdr = bytes.as_slice();
        let iter = Foo11::new(|buf| -> Result<(), io::IoError> {
            match rdr.push(BUFFER_SIZE, buf) {
                Ok(_) => Ok(()),
                Err(io::IoError { kind: io::EndOfFile, .. }) => Ok(()),
                Err(err) => Err(err),
            }
        });

        for (idx, item) in iter.enumerate() {
            assert_eq!(idx as u8, item.unwrap());
        }
    })
}

#[bench]
fn bench_foo11_enum_one_word(b: &mut test::Bencher) {
    let bytes = generate_bytes();
    b.bytes = bytes.len() as u64;

    b.iter(|| {
        let mut rdr = bytes.as_slice();
        let iter = Foo11::new(|buf| -> Result<(), MyError> {
            match rdr.push(BUFFER_SIZE, buf) {
                Ok(_) => Ok(()),
                Err(io::IoError { kind: io::EndOfFile, .. }) => Ok(()),
                Err(_) => Err(MyError::Error),
            }
        });

        for (idx, item) in iter.enumerate() {
            assert_eq!(idx as u8, item.unwrap());
        }
    })
}

#[bench]
fn bench_foo11_null(b: &mut test::Bencher) {
    let bytes = generate_bytes();
    b.bytes = bytes.len() as u64;

    b.iter(|| {
        let mut rdr = bytes.as_slice();
        let iter = Foo11::new(|buf| -> Result<(), ()> {
            match rdr.push(BUFFER_SIZE, buf) {
                Ok(_) => Ok(()),
                Err(io::IoError { kind: io::EndOfFile, .. }) => Ok(()),
                Err(_) => Ok(()), //{ panic!() }
            }
        });

        for (idx, item) in iter.enumerate() {
            assert_eq!(idx as u8, item.unwrap());
        }
    })
}

#[bench]
fn bench_foo11_enum_2_words(b: &mut test::Bencher) {
    let bytes = generate_bytes();
    b.bytes = bytes.len() as u64;

    b.iter(|| {
        let mut rdr = bytes.as_slice();
        let iter = Foo11::new(|buf| -> Result<(), MyError2> {
            match rdr.push(BUFFER_SIZE, buf) {
                Ok(_) => Ok(()),
                Err(io::IoError { kind: io::EndOfFile, .. }) => Ok(()),
                Err(_) => Err(MyError2::Error),
            }
        });

        for (idx, item) in iter.enumerate() {
            assert_eq!(idx as u8, item.unwrap());
        }
    })
}

#[bench]
fn bench_foo11_boxed(b: &mut test::Bencher) {
    let bytes = generate_bytes();
    b.bytes = bytes.len() as u64;

    b.iter(|| {
        let mut rdr = bytes.as_slice();
        let iter = Foo11::new(|buf| -> Result<(), Box<MyError>> {
            match rdr.push(BUFFER_SIZE, buf) {
                Ok(_) => Ok(()),
                Err(io::IoError { kind: io::EndOfFile, .. }) => Ok(()),
                Err(_) => Err(box MyError::Error),
            }
        });

        for (idx, item) in iter.enumerate() {
            assert_eq!(idx as u8, item.unwrap());
        }
    })
}

Here are the results:

test bench_foo11_boxed             ... bench:     13754 ns/iter (+/- 4222) = 74 MB/s
test bench_foo11_enum_2_words      ... bench:     15027 ns/iter (+/- 4318) = 68 MB/s
test bench_foo11_enum_one_word     ... bench:      1550 ns/iter (+/- 417) = 660 MB/s
test bench_foo11_ioerror           ... bench:     25003 ns/iter (+/- 8007) = 40 MB/s
test bench_foo11_null              ... bench:       817 ns/iter (+/- 206) = 1253 MB/s

On a related note, @alexcrichton just found a similar case with:

extern crate test;

use std::iter::repeat;

const N: uint = 100000;

#[deriving(Clone)]
struct Foo;
#[deriving(Clone)]
struct Bar {
    name: &'static str,
    desc: Option<String>,
    other: Option<String>,
}

#[bench]
fn b1(b: &mut test::Bencher) {
    b.iter(|| {
        let r: Result<u8, Foo> = Ok(1u8);
        repeat(r).take(N).map(|x| test::black_box(x)).count()
    });
}

#[bench]
fn b2(b: &mut test::Bencher) {
    b.iter(|| {
        let r: Result<u8, Bar> = Ok(1u8);
        repeat(r).take(N).map(|x| test::black_box(x)).count()
    });
}

The assembly for b1 is about 3 instructions, but the assembly for b2 has a ton of mov instructions.

  1. So I looked at this a little deeper. From the LLVM IR, it seems that when LLVM SROAs the Result struct, it ends up spliting the memcopys and memsets for the moves. Unfortunately, it includes the padding elements in the struct, resulting in bloated code and unaligned loads and stores.

    This would be fixed with some TBAA metadata emitted, so is essentially #6736.

    James Miller at 2014-12-15 05:13:57

  2. This is something I've noticed as well.

    Joshua Yanovski at 2014-12-15 19:40:33

  3. Ok, so I did a basic implementation of tbaa.struct and it doesn't look like it fixes the issue.

    James Miller at 2014-12-16 05:38:44

  4. Ok, so I managed to fix up the codegen, hopefully it's still valid. It improved speed a little, but part of the issue is that IoError is 8 words, or 64 bytes on x86-64.

    pub struct IoError {
      pub kind: IoErrorKind // 2 words because a single variant has a uint associated with it
      pub desc: &'static str // 2 words - the size of a string slice
      pub detail: Option<String> // 4 words - One for the discriminant 3 for the String.
    }
    

    It's no surprise that the code using IoError is so much slower when the data is so much bigger.

    James Miller at 2014-12-20 04:33:24

  5. I was talking with @mcpherrrin and @huonw, and they mentioned that we are shrinking the enum tag down to the smallest integer size, which would result in padding and the unsigned stores and loads you found. I wonder if disabling the shrinking would speed things up.

    Erick Tryzelaar at 2014-12-20 04:46:42

  6. @erickt heh, that's exactly the change I made. Instead of just using the smallest integer type, I use the smallest to figure out what the appropriate alignment is, then use the alignment size to set the discriminant type. For some reason LLVM was struggling to figure out that the padding wasn't being written to or read from.

    James Miller at 2014-12-20 04:55:47

  7. For the record, with @alexcrichton example the slower example gets twice as fast with the newer codegen. It's still much slower than the faster example, but that's hardly surprising.

    James Miller at 2014-12-20 05:05:18

  8. Even with @Aatch's patches, we're just doubling the IoError case from 40MB/s to 80MB/s. The destructor we're getting from the description: Option<Str> is really hurting us. Removing that gets us to 287MB/s. We can convert desc: &'static str into a function, which gets rid of another word and to 339MB/s. Finally, if we remove IoErrorKind::ShortWrite(uint) as suggested in the io reform rfc, we get IoError down into a word. This gets us to 731MB/s. Finally, if implement enum compression, then we'd be able to reduce a type like Result<(), IoError> down into a single word, which would get us to the 1200MB/s number.

    #[deriving(Show, PartialEq, Eq)]
    enum MyError3Kind {
        EndOfFile,
        Error,
        _Error1, //(uint),
    }
    
    #[deriving(Show, PartialEq, Eq)]
    struct MyError3 {
        kind: MyError3Kind,
        //desc: &'static str,
        //description: Option<String>,
    }
    
    impl Error for MyError3 {
        fn is_eof(&self) -> bool {
            self.kind == MyError3Kind::EndOfFile
        }
    }
    
    #[bench]
    fn bench_foo11_enum_smaller_error(b: &mut test::Bencher) {
        let bytes = generate_bytes();
        b.bytes = bytes.len() as u64;
    
        b.iter(|| {
            let mut rdr = bytes.as_slice();
            let iter = Foo11::new(|buf| -> Result<(), MyError3> {
                match rdr.push(BUFFER_SIZE, buf) {
                    Ok(_) => Ok(()),
                    Err(io::IoError { kind: io::EndOfFile, .. }) => Ok(()),
                    Err(_) => {
                        Err(MyError3 {
                            kind: MyError3Kind::Error,
                            //desc: "",
                            //description: Some("foo".to_string()),
                        })
                    }
                }
            });
    
            for (idx, item) in iter.enumerate() {
                assert_eq!(idx as u8, item.unwrap());
            }
        })
    }
    

    Erick Tryzelaar at 2014-12-21 02:26:30

  9. CC @mitsuhiko

    Aria Desires at 2014-12-21 02:39:16

  10. Giant return types simply don't exist in C libraries, even though C structs are first-class. The canonical approach is to pass an extra pointer:

    fn foo(..., _ret: &mut IoError<T>);
    
    fn bar(..., _ret: &mut IoError<T>) {
        foo(..., _ret);
        if !_ret.is_ok() { return; } // no need to copy
    
        // ...
    
        // write our own value when we're done
        *_ret = ...;
    }
    
    fn baz() {
         let mut _ret: IoError<T> = uninitialized();
         bar(..., &mut _ret);
    }
    

    LLVM doesn't have the freedom to change cross-library signatures, but we can probably do this in the Rust calling convention without API breakage.

    Deleted user at 2014-12-26 15:28:39

  11. Rust implements large return types via an out pointer similar to the manual version in C.

    Huon Wilson at 2014-12-26 20:58:10

  12. @huonw: That's already how the platform ABIs handle large return values anyway.

    Daniel Micay at 2014-12-26 21:29:08

  13. The x86_64 ABI uses a higher threshold than Rust's choice of 1 word though.

    Daniel Micay at 2014-12-26 21:29:40

  14. Triage: updated code:

    #![feature(test)]
    extern crate test;
    
    use std::iter::repeat;
    
    const N: usize = 100000;
    
    #[derive(Clone)]
    struct Foo;
    #[derive(Clone)]
    struct Bar {
        name: &'static str,
        desc: Option<String>,
        other: Option<String>,
    }
    
    #[bench]
    fn b1(b: &mut test::Bencher) {
        b.iter(|| {
            let r: Result<u8, Foo> = Ok(1u8);
            repeat(r).take(N).map(|x| test::black_box(x)).count()
        });
    }
    
    #[bench]
    fn b2(b: &mut test::Bencher) {
        b.iter(|| {
            let r: Result<u8, Bar> = Ok(1u8);
            repeat(r).take(N).map(|x| test::black_box(x)).count()
        });
    }
    

    updated benchmarks:

    running 2 tests
    test b1 ... bench:      25,617 ns/iter (+/- 2,836)
    test b2 ... bench:   1,093,834 ns/iter (+/- 50,578)
    

    Steve Klabnik at 2018-09-24 18:00:08

  15. That benchmark on rustc 1.53.0-nightly (07e0e2ec2 2021-03-24) for me returns:

    running 2 tests
    test b1 ... bench:      25,082 ns/iter (+/- 26)
    test b2 ... bench:     501,483 ns/iter (+/- 1,637)
    

    If b2 bench will be changed to

    #[bench]
    fn b2(b: &mut test::Bencher) {
        b.iter(|| {
            let r: Result<u8, Bar> = Ok(1u8);
            iter::repeat_with(||r.as_ref()).take(N).map(|x| test::black_box(x)).count()
        });
    }
    

    results will be

    running 2 tests
    test b1 ... bench:      25,081 ns/iter (+/- 44)
    test b2 ... bench:      50,160 ns/iter (+/- 155)
    

    but this can be not related to actual problem.

    klensy at 2021-03-29 17:50:59

  16. With rustc 1.75.0-nightly (249624b50 2023-10-20):

    running 2 tests                                   
    test b1 ... bench:      21,169 ns/iter (+/- 377)  
    test b2 ... bench:     108,729 ns/iter (+/- 1,838)
    

    Ivan Stanković at 2023-10-23 22:13:20