Result-based error handling optimizes very poorly for large payloads
The overhead seems to be mostly extra memcpys.
Method
I tested three cases:
- direct() -> Result<Huge, Error>
- direct_boxed() -> Result<Box<Huge>, Error>
- indirect(&mut Huge) -> Result<(), Error>
<details> <summary>Click for code</summary>
#![feature(test, box_syntax)]
extern crate test;
use std::io::{Write, Error, ErrorKind};

const DATA_SIZE: usize = 200;

struct Huge {
    data: [u8; DATA_SIZE],
}

struct HugeIter<'a> {
    cur_val: Huge,
    buf: &'a [u8],
}

impl<'a> HugeIter<'a> {
    fn next_direct(&mut self) -> Option<&Huge> {
        if let Ok(val) = parse_huge_direct(&mut self.buf) {
            self.cur_val = val;
            Some(&self.cur_val)
        } else {
            None
        }
    }

    fn next_indirect(&mut self) -> Option<&Huge> {
        if let Ok(()) = parse_huge_indirect(&mut self.buf, &mut self.cur_val) {
            Some(&self.cur_val)
        } else {
            None
        }
    }
}

struct HugeIterBoxed<'a> {
    cur_val: Box<Huge>,
    buf: &'a [u8],
}

impl<'a> HugeIterBoxed<'a> {
    fn next_direct_boxed(&mut self) -> Option<&Huge> {
        if let Ok(val) = parse_huge_direct_boxed(&mut self.buf) {
            self.cur_val = val;
            Some(&self.cur_val)
        } else {
            None
        }
    }
}

fn parse_huge_indirect(src: &mut &[u8], dest: &mut Huge) -> Result<(), Error> {
    if src.len() < DATA_SIZE { return Err(Error::new(ErrorKind::UnexpectedEof, "OH NO")) }
    (&mut dest.data[..]).write_all(&src[..DATA_SIZE])?;
    *src = &src[DATA_SIZE..];
    Ok(())
}

fn parse_huge_direct(src: &mut &[u8]) -> Result<Huge, Error> {
    unsafe {
        if src.len() < DATA_SIZE { return Err(Error::new(ErrorKind::UnexpectedEof, "OH NO")) }
        let mut val = Huge { data: ::std::mem::uninitialized() };
        (&mut val.data[..]).write_all(&src[..DATA_SIZE])?;
        *src = &src[DATA_SIZE..];
        Ok(val)
    }
}

fn parse_huge_direct_boxed(src: &mut &[u8]) -> Result<Box<Huge>, Error> {
    unsafe {
        if src.len() < DATA_SIZE { return Err(Error::new(ErrorKind::UnexpectedEof, "OH NO")) }
        let mut val = box Huge { data: ::std::mem::uninitialized() };
        (&mut val.data[..]).write_all(&src[..DATA_SIZE])?;
        *src = &src[DATA_SIZE..];
        Ok(val)
    }
}

#[bench]
fn bench_direct(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeIter { cur_val: Huge { data: [0; 200] }, buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next_direct() {
            total += val.data[..].iter().cloned().sum();
        }
        total
    });
}

#[bench]
fn bench_indirect(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeIter { cur_val: Huge { data: [0; 200] }, buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next_indirect() {
            total += val.data[..].iter().cloned().sum();
        }
        total
    });
}

#[bench]
fn bench_direct_boxed(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeIterBoxed { cur_val: box Huge { data: [0; 200] }, buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next_direct_boxed() {
            total += val.data[..].iter().cloned().sum();
        }
        total
    });
}
</details>
Bench Results:
test bench_direct_boxed ... bench: 136,621 ns/iter (+/- 32,300)
test bench_direct ... bench: 129,206 ns/iter (+/- 24,201)
test bench_indirect ... bench: 83,301 ns/iter (+/- 21,078)
ASM
<details> <summary>ASM for bench_direct_boxed's closure</summary>
__ZN4test13ns_iter_inner17h91bd325045991a14E:
Lfunc_begin1:
.cfi_startproc
.cfi_personality 155, _rust_eh_personality
.cfi_lsda 16, Lexception1
pushq %rbp
Lcfi8:
.cfi_def_cfa_offset 16
Lcfi9:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi10:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $104, %rsp
Lcfi11:
.cfi_offset %rbx, -56
Lcfi12:
.cfi_offset %r12, -48
Lcfi13:
.cfi_offset %r13, -40
Lcfi14:
.cfi_offset %r14, -32
Lcfi15:
.cfi_offset %r15, -24
movq %rsi, %rbx
movq %rdi, -56(%rbp)
callq __ZN3std4time7Instant3now17hc5a5b38c2510cf8bE
movq %rax, -72(%rbp)
movq %rbx, -48(%rbp)
testq %rbx, %rbx
je LBB1_13
xorl %r15d, %r15d
.p2align 4, 0x90
LBB1_2:
movl $200, %edi
movl $1, %esi
callq ___rust_allocate
movq %rax, %r14
testq %r14, %r14
je LBB1_25
movq %r15, -64(%rbp)
xorl %r13d, %r13d
xorl %esi, %esi
movl $200, %edx
movq %r14, %rdi
callq _memset
movq -56(%rbp), %rax
movq (%rax), %rax
movq 16(%rax), %r12
cmpq $200, %r12
jae LBB1_14
movq %r14, %rbx
jmp LBB1_5
.p2align 4, 0x90
LBB1_14:
movq (%rax), %r15
xorl %r13d, %r13d
.p2align 4, 0x90
LBB1_15:
movl $200, %edi
movl $1, %esi
callq ___rust_allocate
movq %rax, %rbx
testq %rbx, %rbx
je LBB1_16
movl $200, %edx
movq %rbx, %rdi
movq %r15, %rsi
callq _memcpy
movl $200, %esi
movl $1, %edx
movq %r14, %rdi
callq ___rust_deallocate
movdqu (%rbx), %xmm0
movdqu 16(%rbx), %xmm1
movdqu 32(%rbx), %xmm2
movdqu 48(%rbx), %xmm3
paddb %xmm0, %xmm2
paddb %xmm1, %xmm3
movdqu 64(%rbx), %xmm0
movdqu 80(%rbx), %xmm1
movdqu 96(%rbx), %xmm4
movdqu 112(%rbx), %xmm5
paddb %xmm0, %xmm4
paddb %xmm2, %xmm4
paddb %xmm1, %xmm5
paddb %xmm3, %xmm5
movdqu 128(%rbx), %xmm0
movdqu 144(%rbx), %xmm1
movdqu 160(%rbx), %xmm2
movdqu 176(%rbx), %xmm3
paddb %xmm0, %xmm2
paddb %xmm4, %xmm2
paddb %xmm1, %xmm3
paddb %xmm5, %xmm3
paddb %xmm2, %xmm3
pshufd $78, %xmm3, %xmm0
paddb %xmm3, %xmm0
pshufd $229, %xmm0, %xmm1
paddb %xmm0, %xmm1
movdqa %xmm1, %xmm0
psrld $16, %xmm0
paddb %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrlw $8, %xmm1
paddb %xmm0, %xmm1
movdqa %xmm1, -144(%rbp)
movzbl 192(%rbx), %eax
addb -144(%rbp), %al
addb 193(%rbx), %al
addb 194(%rbx), %al
addb 195(%rbx), %al
addb 196(%rbx), %al
addb 197(%rbx), %al
addb 198(%rbx), %al
addb 199(%rbx), %al
addq $200, %r15
addq $-200, %r12
addb %al, %r13b
cmpq $200, %r12
movq %rbx, %r14
jae LBB1_15
LBB1_5:
Ltmp6:
movl $5, %esi
leaq _str.8(%rip), %rdi
callq __ZN3std5error205_$LT$impl$u20$core..convert..From$LT$$RF$$u27$b$u20$str$GT$$u20$for$u20$alloc..boxed..Box$LT$std..error..Error$u20$$u2b$$u20$core..marker..Sync$u20$$u2b$$u20$core..marker..Send$u20$$u2b$$u20$$u27$a$GT$$GT$4from17hbabb0ef23dd78831E
movq %rdx, %rcx
Ltmp7:
leaq -88(%rbp), %r12
Ltmp8:
movl $17, %esi
movq %r12, %rdi
movq %rax, %rdx
callq __ZN3std2io5error5Error4_new17h399b6fceaf2b2ad7E
Ltmp9:
cmpb $2, -88(%rbp)
movq -64(%rbp), %r15
jb LBB1_12
movq -80(%rbp), %r14
movq 8(%r14), %rdi
movq 16(%r14), %rax
Ltmp11:
callq *(%rax)
Ltmp12:
movq 16(%r14), %rax
movq 8(%rax), %rsi
testq %rsi, %rsi
je LBB1_11
movq 8(%r14), %rdi
movq 16(%rax), %rdx
callq ___rust_deallocate
LBB1_11:
movl $24, %esi
movl $8, %edx
movq %r14, %rdi
callq ___rust_deallocate
LBB1_12:
incq %r15
movl $200, %esi
movl $1, %edx
movq %rbx, %rdi
callq ___rust_deallocate
movb %r13b, -88(%rbp)
## InlineAsm Start
## InlineAsm End
cmpq -48(%rbp), %r15
jb LBB1_2
LBB1_13:
leaq -104(%rbp), %rdi
leaq -72(%rbp), %rsi
callq __ZN3std4time7Instant7elapsed17h4a152db7d42dfbccE
movq -104(%rbp), %rax
movl -96(%rbp), %ecx
movq %rax, -120(%rbp)
movl %ecx, -112(%rbp)
leaq -120(%rbp), %rdi
callq __ZN4test11ns_from_dur17h77590a35c757ea7bE
addq $104, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
LBB1_16:
Ltmp3:
callq __ZN5alloc3oom3oom17he0fcc1585169f6ffE
Ltmp4:
LBB1_25:
callq __ZN5alloc3oom3oom17he0fcc1585169f6ffE
LBB1_23:
Ltmp5:
movq %rax, %r15
movq %r14, %rbx
jmp LBB1_24
LBB1_18:
Ltmp13:
movq %rax, %r15
movq 16(%r14), %rax
movq 8(%rax), %rsi
testq %rsi, %rsi
je LBB1_20
movq 8(%r14), %rdi
movq 16(%rax), %rdx
callq ___rust_deallocate
LBB1_20:
movl $24, %esi
movl $8, %edx
movq %r14, %rdi
callq ___rust_deallocate
jmp LBB1_24
LBB1_22:
Ltmp10:
movq %rax, %r15
LBB1_24:
movl $200, %esi
movl $1, %edx
movq %rbx, %rdi
callq ___rust_deallocate
movq %r15, %rdi
callq __Unwind_Resume
Lfunc_end1:
.cfi_endproc
.section __TEXT,__gcc_except_tab
.p2align 2
GCC_except_table1:
Lexception1:
.byte 255
.byte 155
.asciz "\320"
.byte 3
.byte 78
Lset7 = Lfunc_begin1-Lfunc_begin1
.long Lset7
Lset8 = Ltmp6-Lfunc_begin1
.long Lset8
.long 0
.byte 0
Lset9 = Ltmp6-Lfunc_begin1
.long Lset9
Lset10 = Ltmp9-Ltmp6
.long Lset10
Lset11 = Ltmp10-Lfunc_begin1
.long Lset11
.byte 0
Lset12 = Ltmp11-Lfunc_begin1
.long Lset12
Lset13 = Ltmp12-Ltmp11
.long Lset13
Lset14 = Ltmp13-Lfunc_begin1
.long Lset14
.byte 0
Lset15 = Ltmp12-Lfunc_begin1
.long Lset15
Lset16 = Ltmp3-Ltmp12
.long Lset16
.long 0
.byte 0
Lset17 = Ltmp3-Lfunc_begin1
.long Lset17
Lset18 = Ltmp4-Ltmp3
.long Lset18
Lset19 = Ltmp5-Lfunc_begin1
.long Lset19
.byte 0
Lset20 = Ltmp4-Lfunc_begin1
.long Lset20
Lset21 = Lfunc_end1-Ltmp4
.long Lset21
.long 0
.byte 0
.p2align 2
.section __TEXT,__text,regular,pure_instructions
.p2align 4, 0x90
</details>
<details>
<summary>ASM for bench_direct's closure</summary>
Lfunc_begin0:
.cfi_startproc
.cfi_personality 155, _rust_eh_personality
.cfi_lsda 16, Lexception0
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $520, %rsp
Lcfi3:
.cfi_offset %rbx, -56
Lcfi4:
.cfi_offset %r12, -48
Lcfi5:
.cfi_offset %r13, -40
Lcfi6:
.cfi_offset %r14, -32
Lcfi7:
.cfi_offset %r15, -24
movq %rsi, %rbx
movq %rdi, -72(%rbp)
callq __ZN3std4time7Instant3now17hc5a5b38c2510cf8bE
movq %rax, -96(%rbp)
movq %rbx, -64(%rbp)
testq %rbx, %rbx
je LBB0_17
xorl %eax, %eax
.p2align 4, 0x90
LBB0_2:
incq %rax
movq %rax, -80(%rbp)
movq -72(%rbp), %rax
movq (%rax), %rax
movq (%rax), %rcx
movq 16(%rax), %r14
pxor %xmm0, %xmm0
movdqa %xmm0, -176(%rbp)
movdqa %xmm0, -192(%rbp)
movdqa %xmm0, -208(%rbp)
movdqa %xmm0, -224(%rbp)
movdqa %xmm0, -240(%rbp)
movdqa %xmm0, -256(%rbp)
movdqa %xmm0, -272(%rbp)
movdqa %xmm0, -288(%rbp)
movdqa %xmm0, -304(%rbp)
movdqa %xmm0, -320(%rbp)
movdqa %xmm0, -336(%rbp)
movdqa %xmm0, -352(%rbp)
movq $0, -160(%rbp)
movq %rcx, -152(%rbp)
movq %r14, -144(%rbp)
xorl %r15d, %r15d
cmpq $200, %r14
jb LBB0_7
jmp LBB0_4
.p2align 4, 0x90
LBB0_6:
movdqu (%rax), %xmm0
movdqu 16(%rax), %xmm1
movdqu 32(%rax), %xmm2
movdqu 48(%rax), %xmm3
paddb %xmm0, %xmm2
paddb %xmm1, %xmm3
movdqu 64(%rax), %xmm0
movdqu 80(%rax), %xmm1
movdqu 96(%rax), %xmm4
movdqu 112(%rax), %xmm5
paddb %xmm0, %xmm4
paddb %xmm2, %xmm4
paddb %xmm1, %xmm5
paddb %xmm3, %xmm5
movdqu 128(%rax), %xmm0
movdqu 144(%rax), %xmm1
movdqu 160(%rax), %xmm2
movdqu 176(%rax), %xmm3
paddb %xmm0, %xmm2
paddb %xmm4, %xmm2
paddb %xmm1, %xmm3
paddb %xmm5, %xmm3
paddb %xmm2, %xmm3
pshufd $78, %xmm3, %xmm0
paddb %xmm3, %xmm0
pshufd $229, %xmm0, %xmm1
paddb %xmm0, %xmm1
movdqa %xmm1, %xmm0
psrld $16, %xmm0
paddb %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrlw $8, %xmm1
paddb %xmm0, %xmm1
movdqa %xmm1, -368(%rbp)
movzbl 192(%rax), %ecx
addb -368(%rbp), %cl
addb 193(%rax), %cl
addb 194(%rax), %cl
addb 195(%rax), %cl
addb 196(%rax), %cl
addb 197(%rax), %cl
addb 198(%rax), %cl
addb 199(%rax), %cl
addb %cl, %r15b
movq -144(%rbp), %r14
cmpq $200, %r14
jae LBB0_4
LBB0_7:
movl $5, %esi
leaq _str.8(%rip), %rdi
callq __ZN3std5error205_$LT$impl$u20$core..convert..From$LT$$RF$$u27$b$u20$str$GT$$u20$for$u20$alloc..boxed..Box$LT$std..error..Error$u20$$u2b$$u20$core..marker..Sync$u20$$u2b$$u20$core..marker..Send$u20$$u2b$$u20$$u27$a$GT$$GT$4from17hbabb0ef23dd78831E
movq %rdx, %rcx
movl $17, %esi
leaq -552(%rbp), %rdi
movq %rax, %rdx
callq __ZN3std2io5error5Error4_new17h399b6fceaf2b2ad7E
movzbl -552(%rbp), %eax
leaq -551(%rbp), %rcx
movq %rcx, %rdx
movzbl 6(%rdx), %ecx
movb %cl, -42(%rbp)
movzwl 4(%rdx), %ecx
movw %cx, -44(%rbp)
movl (%rdx), %ecx
movl %ecx, -48(%rbp)
cmpb $2, %al
jb LBB0_12
movq -544(%rbp), %r14
movq 8(%r14), %rdi
movq 16(%r14), %rax
Ltmp0:
callq *(%rax)
Ltmp1:
movq 16(%r14), %rax
movq 8(%rax), %rsi
testq %rsi, %rsi
je LBB0_11
movq 8(%r14), %rdi
movq 16(%rax), %rdx
callq ___rust_deallocate
LBB0_11:
movl $24, %esi
movl $8, %edx
movq %r14, %rdi
callq ___rust_deallocate
LBB0_12:
xorl %eax, %eax
testq %rax, %rax
jne LBB0_6
jmp LBB0_16
.p2align 4, 0x90
LBB0_4:
movq -152(%rbp), %rbx
movzbl 6(%rbx), %eax
movb %al, -50(%rbp)
movzwl 4(%rbx), %eax
movw %ax, -52(%rbp)
movl (%rbx), %eax
movl %eax, -56(%rbp)
movzbl 7(%rbx), %r12d
movzbl 14(%rbx), %eax
movb %al, -42(%rbp)
movzwl 12(%rbx), %eax
movw %ax, -44(%rbp)
movl 8(%rbx), %eax
movl %eax, -48(%rbp)
movq 15(%rbx), %rax
movq %rax, -88(%rbp)
leaq 23(%rbx), %rsi
movl $177, %edx
leaq -552(%rbp), %r13
movq %r13, %rdi
callq _memcpy
addq $200, %rbx
addq $-200, %r14
movq %rbx, -152(%rbp)
movq %r14, -144(%rbp)
movzbl -50(%rbp), %eax
movb %al, -346(%rbp)
movzwl -52(%rbp), %eax
movw %ax, -348(%rbp)
movl -56(%rbp), %eax
movl %eax, -352(%rbp)
movb %r12b, -345(%rbp)
movzbl -42(%rbp), %eax
leaq -344(%rbp), %rcx
movb %al, 6(%rcx)
movzwl -44(%rbp), %eax
movw %ax, 4(%rcx)
movl -48(%rbp), %eax
movl %eax, (%rcx)
movq -88(%rbp), %rax
movq %rax, -337(%rbp)
movl $177, %edx
leaq -329(%rbp), %rdi
movq %r13, %rsi
callq _memcpy
leaq -352(%rbp), %rax
testq %rax, %rax
jne LBB0_6
LBB0_16:
movb %r15b, -352(%rbp)
leaq -352(%rbp), %rax
## InlineAsm Start
## InlineAsm End
movq -80(%rbp), %rax
cmpq -64(%rbp), %rax
jb LBB0_2
LBB0_17:
leaq -112(%rbp), %rdi
leaq -96(%rbp), %rsi
callq __ZN3std4time7Instant7elapsed17h4a152db7d42dfbccE
movq -112(%rbp), %rax
movl -104(%rbp), %ecx
movq %rax, -128(%rbp)
movl %ecx, -120(%rbp)
leaq -128(%rbp), %rdi
callq __ZN4test11ns_from_dur17h77590a35c757ea7bE
addq $520, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
LBB0_13:
Ltmp2:
movq %rax, %rbx
movq 16(%r14), %rax
movq 8(%rax), %rsi
testq %rsi, %rsi
je LBB0_15
movq 8(%r14), %rdi
movq 16(%rax), %rdx
callq ___rust_deallocate
LBB0_15:
movl $24, %esi
movl $8, %edx
movq %r14, %rdi
callq ___rust_deallocate
movq %rbx, %rdi
callq __Unwind_Resume
Lfunc_end0:
.cfi_endproc
.section __TEXT,__gcc_except_tab
.p2align 2
GCC_except_table0:
Lexception0:
.byte 255
.byte 155
.byte 41
.byte 3
.byte 39
Lset0 = Lfunc_begin0-Lfunc_begin0
.long Lset0
Lset1 = Ltmp0-Lfunc_begin0
.long Lset1
.long 0
.byte 0
Lset2 = Ltmp0-Lfunc_begin0
.long Lset2
Lset3 = Ltmp1-Ltmp0
.long Lset3
Lset4 = Ltmp2-Lfunc_begin0
.long Lset4
.byte 0
Lset5 = Ltmp1-Lfunc_begin0
.long Lset5
Lset6 = Lfunc_end0-Ltmp1
.long Lset6
.long 0
.byte 0
.p2align 2
.section __TEXT,__text,regular,pure_instructions
.p2align 4, 0x90
</details>
<details>
<summary>ASM for bench_indirect's closure</summary>
__ZN4test13ns_iter_inner17hba96d7af2bd3d26cE:
Lfunc_begin2:
.cfi_startproc
.cfi_personality 155, _rust_eh_personality
.cfi_lsda 16, Lexception2
pushq %rbp
Lcfi16:
.cfi_def_cfa_offset 16
Lcfi17:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi18:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $312, %rsp
Lcfi19:
.cfi_offset %rbx, -56
Lcfi20:
.cfi_offset %r12, -48
Lcfi21:
.cfi_offset %r13, -40
Lcfi22:
.cfi_offset %r14, -32
Lcfi23:
.cfi_offset %r15, -24
movq %rsi, %rbx
movq %rdi, -56(%rbp)
callq __ZN3std4time7Instant3now17hc5a5b38c2510cf8bE
movq %rax, -64(%rbp)
movq %rbx, -48(%rbp)
testq %rbx, %rbx
je LBB2_9
xorl %r14d, %r14d
leaq -336(%rbp), %r12
.p2align 4, 0x90
LBB2_2:
movq -56(%rbp), %rax
movq (%rax), %rax
movq (%rax), %r15
movq 16(%rax), %rbx
pxor %xmm0, %xmm0
movdqa %xmm0, -160(%rbp)
movdqa %xmm0, -176(%rbp)
movdqa %xmm0, -192(%rbp)
movdqa %xmm0, -208(%rbp)
movdqa %xmm0, -224(%rbp)
movdqa %xmm0, -240(%rbp)
movdqa %xmm0, -256(%rbp)
movdqa %xmm0, -272(%rbp)
movdqa %xmm0, -288(%rbp)
movdqa %xmm0, -304(%rbp)
movdqa %xmm0, -320(%rbp)
movdqa %xmm0, -336(%rbp)
movq $0, -144(%rbp)
movq %r15, -136(%rbp)
movq %rbx, -128(%rbp)
xorl %r13d, %r13d
cmpq $199, %rbx
jbe LBB2_3
.p2align 4, 0x90
LBB2_13:
movl $200, %edx
movq %r12, %rdi
movq %r15, %rsi
callq _memcpy
addq $200, %r15
addq $-200, %rbx
movq %r15, -136(%rbp)
movq %rbx, -128(%rbp)
movdqa -304(%rbp), %xmm0
movdqa -288(%rbp), %xmm1
paddb -336(%rbp), %xmm0
paddb -320(%rbp), %xmm1
paddb -272(%rbp), %xmm0
paddb -256(%rbp), %xmm1
paddb -240(%rbp), %xmm0
paddb -224(%rbp), %xmm1
paddb -208(%rbp), %xmm0
paddb -192(%rbp), %xmm1
paddb -176(%rbp), %xmm0
paddb -160(%rbp), %xmm1
paddb %xmm0, %xmm1
pshufd $78, %xmm1, %xmm0
paddb %xmm1, %xmm0
pshufd $229, %xmm0, %xmm1
paddb %xmm0, %xmm1
movdqa %xmm1, %xmm0
psrld $16, %xmm0
paddb %xmm1, %xmm0
movdqa %xmm0, %xmm1
psrlw $8, %xmm1
paddb %xmm0, %xmm1
movdqa %xmm1, -352(%rbp)
movzbl -144(%rbp), %eax
addb -352(%rbp), %al
addb -143(%rbp), %al
addb -142(%rbp), %al
addb -141(%rbp), %al
addb -140(%rbp), %al
addb -139(%rbp), %al
addb -138(%rbp), %al
addb -137(%rbp), %al
addb %al, %r13b
cmpq $200, %rbx
jae LBB2_13
LBB2_3:
movl $5, %esi
leaq _str.8(%rip), %rdi
callq __ZN3std5error205_$LT$impl$u20$core..convert..From$LT$$RF$$u27$b$u20$str$GT$$u20$for$u20$alloc..boxed..Box$LT$std..error..Error$u20$$u2b$$u20$core..marker..Sync$u20$$u2b$$u20$core..marker..Send$u20$$u2b$$u20$$u27$a$GT$$GT$4from17hbabb0ef23dd78831E
movq %rdx, %rcx
movl $17, %esi
leaq -112(%rbp), %rdi
movq %rax, %rdx
callq __ZN3std2io5error5Error4_new17h399b6fceaf2b2ad7E
cmpb $2, -112(%rbp)
jb LBB2_8
movq -104(%rbp), %rbx
movq 8(%rbx), %rdi
movq 16(%rbx), %rax
Ltmp14:
callq *(%rax)
Ltmp15:
movq 16(%rbx), %rax
movq 8(%rax), %rsi
testq %rsi, %rsi
je LBB2_7
movq 8(%rbx), %rdi
movq 16(%rax), %rdx
callq ___rust_deallocate
LBB2_7:
movl $24, %esi
movl $8, %edx
movq %rbx, %rdi
callq ___rust_deallocate
LBB2_8:
incq %r14
movb %r13b, -336(%rbp)
## InlineAsm Start
## InlineAsm End
cmpq -48(%rbp), %r14
jb LBB2_2
LBB2_9:
leaq -80(%rbp), %rdi
leaq -64(%rbp), %rsi
callq __ZN3std4time7Instant7elapsed17h4a152db7d42dfbccE
movq -80(%rbp), %rax
movl -72(%rbp), %ecx
movq %rax, -96(%rbp)
movl %ecx, -88(%rbp)
leaq -96(%rbp), %rdi
callq __ZN4test11ns_from_dur17h77590a35c757ea7bE
addq $312, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
LBB2_10:
Ltmp16:
movq %rax, %r14
movq 16(%rbx), %rax
movq 8(%rax), %rsi
testq %rsi, %rsi
je LBB2_12
movq 8(%rbx), %rdi
movq 16(%rax), %rdx
callq ___rust_deallocate
LBB2_12:
movl $24, %esi
movl $8, %edx
movq %rbx, %rdi
callq ___rust_deallocate
movq %r14, %rdi
callq __Unwind_Resume
Lfunc_end2:
.cfi_endproc
.section __TEXT,__gcc_except_tab
.p2align 2
GCC_except_table2:
Lexception2:
.byte 255
.byte 155
.byte 41
.byte 3
.byte 39
Lset22 = Lfunc_begin2-Lfunc_begin2
.long Lset22
Lset23 = Ltmp14-Lfunc_begin2
.long Lset23
.long 0
.byte 0
Lset24 = Ltmp14-Lfunc_begin2
.long Lset24
Lset25 = Ltmp15-Ltmp14
.long Lset25
Lset26 = Ltmp16-Lfunc_begin2
.long Lset26
.byte 0
Lset27 = Ltmp15-Lfunc_begin2
.long Lset27
Lset28 = Lfunc_end2-Ltmp15
.long Lset28
.long 0
.byte 0
.p2align 2
.section __TEXT,__literal16,16byte_literals
.p2align 4
LCPI3_0:
.long 1127219200
.long 1160773632
.long 0
.long 0
LCPI3_1:
.quad 4841369599423283200
.quad 4985484787499139072
.section __TEXT,__literal8,8byte_literals
.p2align 3
LCPI3_2:
.quad 4617315517961601024
LCPI3_3:
.quad 4607182418800017408
.section __TEXT,__text,regular,pure_instructions
.p2align 4, 0x90
</details>
For reference, this is basically a simplified version of what webrender's display-list parsing with bincode looks like.
Aria Desires at 2017-05-16 20:28:10
I expect the bottleneck is that, without a fuller picture, the indirect and direct versions appear to have very different error semantics: direct preserves the old value on Err, while indirect partially overwrites the old value on Err. In our case we simply don't care what the value is on Err, so the two are semantically identical. However, knowing this requires fairly strong alias information.
Aria Desires at 2017-05-16 21:15:33
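As a minimal sketch of the semantic difference Aria describes (hypothetical stub functions that always fail; not code from the benchmarks above):

fn direct(_src: &[u8]) -> Result<[u8; 4], ()> {
    Err(())
}

fn indirect(_src: &[u8], dest: &mut [u8; 4]) -> Result<(), ()> {
    dest[0] = 0xFF; // partial write before discovering the error
    Err(())
}

fn main() {
    let mut cur = [1u8; 4];
    if direct(&[]).is_err() {
        assert_eq!(cur, [1, 1, 1, 1]); // direct: old value fully preserved
    }
    if indirect(&[], &mut cur).is_err() {
        assert_eq!(cur, [0xFF, 1, 1, 1]); // indirect: old value partially clobbered
    }
}

The compiler can't rewrite direct into indirect on its own unless it can prove the caller never looks at the old value after an Err, which is exactly the alias information mentioned above.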
On what hardware were the benchmark results obtained?
Sander Maijers at 2017-05-17 08:33:19
Reverting #31545 improves things for me from:

test bench_direct ... bench: 125,023 ns/iter (+/- 694)
test bench_direct_boxed ... bench: 116,033 ns/iter (+/- 768)
test bench_indirect ... bench: 75,778 ns/iter (+/- 711)

to:

test bench_direct ... bench: 92,552 ns/iter (+/- 4,365)
test bench_direct_boxed ... bench: 107,775 ns/iter (+/- 345)
test bench_indirect ... bench: 75,286 ns/iter (+/- 49)
Jeff Muizelaar at 2017-07-19 16:59:35
Assuming we can write directly in the return destination, I came up with different benchmarks:
<details> <summary>Click for code</summary>
#![feature(test, box_syntax)]
extern crate test;
use std::io::{Write, Error, ErrorKind};

const DATA_SIZE: usize = 200;

#[derive(Copy, Clone)]
struct Huge {
    data: [u8; DATA_SIZE],
}

struct HugeIter<'a> {
    cur_val: Huge,
    buf: &'a [u8],
}

impl<'a> HugeIter<'a> {
    fn next(&mut self) -> Option<&Huge> {
        if let Ok(()) = parse_huge(&mut self.buf, &mut self.cur_val) {
            Some(&self.cur_val)
        } else {
            None
        }
    }

    fn next_roundtrip_caller(&mut self) -> Option<&Huge> {
        unsafe {
            let mut val = Huge { data: ::std::mem::uninitialized() };
            let result = match parse_huge(&mut self.buf, &mut val) {
                Ok(()) => Ok(val),
                Err(e) => Err(e),
            };
            if let Ok(ref val) = result {
                self.cur_val = *val;
                Some(&self.cur_val)
            } else {
                None
            }
        }
    }
}

struct HugeResultIter<'a> {
    cur_val: Result<Huge, Error>,
    buf: &'a [u8],
}

impl<'a> HugeResultIter<'a> {
    fn next(&mut self) -> Option<&Huge> {
        parse_huge_result(&mut self.buf, &mut self.cur_val);
        self.cur_val.as_ref().ok()
    }

    fn next_copy(&mut self) -> Option<&Huge> {
        parse_huge_result_copy(&mut self.buf, &mut self.cur_val);
        self.cur_val.as_ref().ok()
    }

    fn next_copy_return(&mut self) -> Option<&Huge> {
        self.cur_val = parse_huge_result_copy_return(&mut self.buf);
        self.cur_val.as_ref().ok()
    }
}

fn parse_huge(src: &mut &[u8], dest: &mut Huge) -> Result<(), Error> {
    if src.len() < DATA_SIZE { return Err(Error::new(ErrorKind::UnexpectedEof, "OH NO")) }
    (&mut dest.data[..]).write_all(&src[..DATA_SIZE])?;
    *src = &src[DATA_SIZE..];
    Ok(())
}

fn parse_huge_result(src: &mut &[u8], dest: &mut Result<Huge, Error>) {
    if src.len() < DATA_SIZE {
        *dest = Err(Error::new(ErrorKind::UnexpectedEof, "OH NO"));
        return;
    }
    let mut result = Ok(());
    if let Ok(ref mut val) = *dest {
        result = (&mut val.data[..]).write_all(&src[..DATA_SIZE]);
        *src = &src[DATA_SIZE..];
    }
    if let Err(e) = result {
        *dest = Err(e);
    }
}

fn parse_huge_result_copy(src: &mut &[u8], dest: &mut Result<Huge, Error>) {
    if src.len() < DATA_SIZE {
        *dest = Err(Error::new(ErrorKind::UnexpectedEof, "OH NO"));
        return;
    }
    unsafe {
        let mut val = Huge { data: ::std::mem::uninitialized() };
        let result = (&mut val.data[..]).write_all(&src[..DATA_SIZE]);
        if let Err(e) = result {
            *dest = Err(e);
        } else {
            *dest = Ok(val);
        }
        *src = &src[DATA_SIZE..];
    }
}

fn parse_huge_result_copy_return(src: &mut &[u8]) -> Result<Huge, Error> {
    if src.len() < DATA_SIZE {
        return Err(Error::new(ErrorKind::UnexpectedEof, "OH NO"));
    }
    unsafe {
        let mut val = Huge { data: ::std::mem::uninitialized() };
        (&mut val.data[..]).write_all(&src[..DATA_SIZE])?;
        *src = &src[DATA_SIZE..];
        Ok(val)
    }
}

#[bench]
fn bench_huge(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeIter { cur_val: Huge { data: [0; 200] }, buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next() {
            total += val.data[..].iter().cloned().sum::<u8>();
        }
        total
    });
}

#[bench]
fn bench_huge_roundtrip_caller(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeIter { cur_val: Huge { data: [0; 200] }, buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next_roundtrip_caller() {
            total += val.data[..].iter().cloned().sum::<u8>();
        }
        total
    });
}

#[bench]
fn bench_huge_result(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeResultIter { cur_val: Ok(Huge { data: [0; 200] }), buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next() {
            total += val.data[..].iter().cloned().sum::<u8>();
        }
        total
    });
}

#[bench]
fn bench_huge_result_copy(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeResultIter { cur_val: Ok(Huge { data: [0; 200] }), buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next_copy() {
            total += val.data[..].iter().cloned().sum::<u8>();
        }
        total
    });
}

#[bench]
fn bench_huge_result_copy_return(b: &mut test::Bencher) {
    let data = test::black_box(vec![0; 1_000_000]);
    b.iter(|| {
        let mut iter = HugeResultIter { cur_val: Ok(Huge { data: [0; 200] }), buf: &data };
        let mut total: u8 = 0;
        while let Some(val) = iter.next_copy_return() {
            total += val.data[..].iter().cloned().sum::<u8>();
        }
        total
    });
}
</details>

test bench_huge ... bench: 88,330 ns/iter (+/- 2,495)
test bench_huge_result ... bench: 154,788 ns/iter (+/- 8,861)
test bench_huge_result_copy ... bench: 168,051 ns/iter (+/- 4,143)
test bench_huge_result_copy_return ... bench: 151,590 ns/iter (+/- 11,387)
test bench_huge_roundtrip_caller ... bench: 145,835 ns/iter (+/- 4,674)

The difference between `bench_huge_result` and `bench_huge_result_copy{,_return}` is that the latter two write into a local variable and then copy that variable into the `(dest as Ok).0` field, which we should be able to optimize at the MIR level. That, combined with an ABI that passes `A(T) | B(U)` enums as either two return pointers and a `bool` tag, or a return pointer for the larger of `T` and `U` when the other one is immediate and has a niche (e.g. non-nullable pointer), on a case-by-case basis, should provide the necessary speedup.

EDIT: For doing the `A(T) | B(U)` optimization, we might need to have a destructuring assignment in MIR calls, instead of a single `Lvalue` destination, or some other encoding, because by the time it gets to LLVM, the destination propagation ~~might have~~ has to have been performed.

EDIT2: I've added `bench_huge_roundtrip_caller`, which shows that LLVM can't optimize out the temporary `Result` in the caller to pass the final destination in the call.

Eduard-Mihai Burtescu at 2017-11-15 13:28:40
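As a hand-rolled sketch of the niche case of that ABI (all names hypothetical; `Box<String>` stands in for a non-nullable error pointer): the callee fills the large payload through a single return pointer, and the error pointer itself doubles as the tag because it is never null.

use std::ptr;

struct Huge { data: [u8; 200] }

// Hypothetical "callee ABI": write the Ok payload through `out` and return
// null, or return the boxed error; its non-null-ness is the discriminant.
unsafe fn parse_abi(src: &[u8], out: *mut Huge) -> *mut String {
    if src.len() < 200 {
        return Box::into_raw(Box::new(String::from("OH NO")));
    }
    (*out).data.copy_from_slice(&src[..200]);
    ptr::null_mut()
}

fn main() {
    let buf = [7u8; 300];
    let mut dest = Huge { data: [0; 200] };
    let err = unsafe { parse_abi(&buf, &mut dest) };
    if err.is_null() {
        println!("parsed in place: {}", dest.data[0]); // no extra memcpy
    } else {
        println!("error: {}", unsafe { *Box::from_raw(err) });
    }
}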
Yes, that matches what I'd been thinking the solution would be. (not sure on the precise MIR details)
To be more concrete:
fn callee() -> Result<T, E>;

would lower to (using an &uninit to represent a pointer that doesn't have a destructor):

fn callee_abi(ok: &uninit T, err: &uninit E) -> bool;

So that the callee knows that it can freely clobber `*ok` and `*err`, and only one will be interpreted if the function returns the corresponding bool.

However the hard part isn't the callee, but rather the caller. It needs to prove that it's semantically valid to feed its locals into such an ABI. Consider for instance:

fn caller() -> Result<(), E> {
    let mut x = vec![1, 2, 3];
    println!("{:?}", x);
    x = callee()?;
    println!("{:?}", x);
}

Naively this desugars to:

fn caller_abi(err: &uninit E) -> bool {
    let mut x = vec![1, 2, 3];
    println!("{:?}", x);
    let temp_ok;
    if !callee_abi(&uninit temp_ok, err) {
        return false;
    }
    drop(x); // Drop the old value *after* the call!
    x = temp_ok;
    println!("{:?}", x);
    drop(x);
    return true;
}

But what we really want is:

fn caller_abi(err: &uninit E) -> bool {
    let mut x = vec![1, 2, 3];
    println!("{:?}", x);
    drop(x); // Drop the old value *before* the call
    if !callee_abi(&uninit x, err) {
        return false;
    }
    println!("{:?}", x);
    drop(x);
    return true;
}

Specifically, Rust isn't within its rights to arbitrarily reorder destructors between calls, especially when `x` changes from a local to a field of `&mut self`, and especially when unwinding is possible.

Currently the real-world use-case that's motivating this issue (webrender's display-list deserializing iterator) has several nice properties that make this transform easier:
- panic=abort
- POD
But the last case isn't something I can fully guarantee in the future. In particular we might add a (rare) case to our top-level enum that has a destructor, making the whole thing have a destructor. It's possible we could do a match to check if the enum is "currently POD" ("dynamically needs_drop") and then take a fast no-copy path or a slow path with a temporary? But that's a really sketchy application-specific optimization.
Aria Desires at 2017-11-15 16:51:13
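A sketch of the "currently POD" check floated above (hypothetical `Item` enum, not webrender's real type): branch on whether the *current* value dynamically needs_drop, and only pay for a temporary on the rare variant with a destructor.

enum Item {
    Pod([u8; 16]),
    Rare(Vec<u8>), // has a destructor
}

fn parse(src: &[u8]) -> Item {
    Item::Pod([src[0]; 16]) // hypothetical deserializer
}

fn replace_in_place(slot: &mut Item, src: &[u8]) {
    match *slot {
        // Fast path: no destructor in the old value, so a smarter
        // implementation could deserialize straight into `slot`'s bytes.
        Item::Pod(_) => *slot = parse(src),
        // Slow path: the old value must be dropped at a well-defined
        // point, so go through a temporary.
        Item::Rare(_) => {
            let tmp = parse(src);
            *slot = tmp;
        }
    }
}

fn main() {
    let mut slot = Item::Pod([0; 16]);
    replace_in_place(&mut slot, &[42]);
    if let Item::Pod(d) = slot { println!("{}", d[0]); }
}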
Oh, you will also need to answer the question of whether the `ok` and `err` payloads may alias, which would allow:

fn foo() -> Result<T, E>;
let result = box foo();

to be lowered to:

let result = malloc();
let is_ok = foo_abi(&uninit result.repr.ok_payload, &uninit result.repr.err_payload);
result.repr.tag = if is_ok { Ok } else { Err };

On balance I think the answer should be "no" (it would probably pessimize the callee too much?)
Aria Desires at 2017-11-15 18:24:40
Potential steps that we could take to achieve this:
pub fn foo() -> u64 {
    let mut x = 0;
    // Doesn't currently propagate destination because of the borrow.
    drop(&mut x);
    x
}

pub struct Newtype<T>(T);

pub fn foo() -> Newtype<u64> {
    let mut x = 0;
    drop(&mut x);
    // Does not currently propagate the destination because of the field projection.
    Newtype(x)
}

The last example should generalize to all enums (random note: deaggregation doesn't work on tuples for some reason? maybe it doesn't work on arrays either).
EDIT: wait, I said "doesn't currently", but we don't seem to actually have any kind of destination propagation MIR optimization.
Eduard-Mihai Burtescu at 2017-11-16 13:09:36
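For concreteness, a hand-waved rendition of what such a destination-propagation pass would do to the `Newtype` example (the MIR in the comments is illustrative, not real rustc output):

pub struct Newtype<T>(T);

pub fn foo() -> Newtype<u64> {
    let mut x = 0;
    drop(&mut x);
    Newtype(x)
    // Today, roughly:     _x = 0; drop(&mut _x); RETURN.0 = _x;
    // After propagation:  RETURN.0 = 0; drop(&mut RETURN.0);
    // `x` would live directly in the return place's field and the final
    // copy disappears; for a 200-byte payload like Huge, that copy is the
    // memcpy the benchmarks keep measuring.
}

fn main() {
    println!("{}", foo().0);
}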
@Gankro If they can't alias, we can't actually use this with a regular `Result<T, E>` layout, and we end up with `(bool, T, E)` on the stack. I think writes should be exclusive; what are you thinking about?

Eduard-Mihai Burtescu at 2017-11-16 13:12:17
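A quick way to see the cost of non-aliasing slots (exact sizes are target- and compiler-dependent):

use std::mem::size_of;

fn main() {
    // Overlapped layout: the Ok and Err payloads share storage.
    println!("Result<[u8; 200], u64> = {} bytes",
             size_of::<Result<[u8; 200], u64>>());
    // Disjoint slots, as a non-aliasing two-pointer ABI would force the
    // caller to reserve: tag plus both payloads side by side.
    println!("(bool, [u8; 200], u64) = {} bytes",
             size_of::<(bool, [u8; 200], u64)>());
}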
So I've been working on a `Move`/`Copy` split of `Operand::Consume` that @nikomatsakis suggested, and if we assign `Move` the semantics that it invalidates all borrows, we can be more aggressive with optimizations for non-`Copy` types than LLVM, some of them even straightforwardly so.

Eduard-Mihai Burtescu at 2017-11-17 17:00:52
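A toy illustration of why that `Move` semantics helps (source-level; the actual reasoning happens on MIR):

struct Huge { data: [u8; 200] } // non-Copy, so `let b = a` is a MIR Move

fn main() {
    let a = Huge { data: [1; 200] };
    let r = &a.data[0];
    println!("{}", r); // last use of any borrow of `a`
    let b = a; // under the proposed semantics, this Move invalidates every
               // borrow of `a`, so the optimizer may let `b` reuse `a`'s
               // 200-byte stack slot instead of memcpy'ing it
    println!("{}", b.data[0]);
}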
So here's the best counter-argument I can come up with for aliasing T and E:
fn foo() -> Result<Vec<T>, E> {
    let mut result = vec![];
    for x in values() {
        // I forget how placers work but I want to use them for p e r f
        result.place_back() <- try_process(x)?;
    }
    Ok(result)
}

lowers to:

fn foo_abi(ok: &uninit Vec<T>, err: &uninit E) -> bool {
    *ok = vec![];
    for x in values() {
        let place = ok.place_back();
        let temp_err;
        if !try_process_abi(&place, &temp_err, x) {
            // Can't construct in-place because we can't reorder drop of `ok`
            let temp_err2 = convert_into_other_type(temp_err);
            drop_in_place(ok);
            *err = temp_err2;
            return false;
        }
    }
    return true;
}

In this example we see that `ok` needs_drop, and it's destroyed at the end of the function. This prevents us from ever constructing the error in-place (unless we can prove that reordering the destructor with the side-effects of convert_into_other_type and/or try_process is sound). How important this is is unclear to me.

Aria Desires at 2017-11-17 17:17:33
FWIW, I hope not to expose the separate locations, which would indeed hamper that optimization. However, that's closer to today than guaranteeing the variants don't overlap, and I don't mind being conservative while trying such a scheme out.
Eduard-Mihai Burtescu at 2017-11-17 21:36:50
~~An experimental "lvalue reuse" (aka destination propagation, aka NRVO) optimization limited to locals (not fields thereof, etc.) puts `bench_huge_result` close to `bench_huge` (both from https://github.com/rust-lang/rust/issues/42047#issuecomment-344592076).~~

EDIT: I just realized that the measurements in that comment aren't what I remember, that's weird.
Eduard-Mihai Burtescu at 2017-11-28 05:59:27
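For reference, a source-level rendition of the transform that experiment performs (the real pass operates on MIR locals):

fn make() -> [u8; 200] {
    let mut buf = [0u8; 200];
    buf[0] = 1;
    buf // without lvalue reuse: `buf` gets its own slot and is memcpy'd
        // into the caller-provided return place here; with it, `buf` is
        // allocated in the return place from the start and the copy vanishes
}

fn main() {
    println!("{}", make()[0]);
}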