rust can't serialize 11 fields efficiently
Using noalias (#45012) lets rust generate much better code for the serialization of 10 fields in good_bake_bytes(); however, it falls back to terrible codegen with the 11 fields of bad_bake_bytes().
use std::io::Write;
use std::{io, ptr};
/// A `Write` adapter that appends to a `Vec<u8>` with no capacity or bounds
/// checks, written this way to produce the minimal serialization code whose
/// codegen is being compared below.
struct UnsafeVecWriter<'a>(&'a mut Vec<u8>);
impl<'a> Write for UnsafeVecWriter<'a> {
// SAFETY assumption: the caller must have reserved at least `buf.len()`
// spare capacity before each call; otherwise `set_len` plus the copy below
// write past the allocation. NOTE(review): `main` below never reserves —
// confirm capacity handling before reusing this type outside the repro.
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
unsafe {
// Bump the length first, then copy the bytes into the newly claimed
// tail of the buffer. This length read-modify-write per call is
// exactly the pattern discussed in the thread below.
let old_len = self.0.len();
self.0.set_len(old_len + buf.len());
ptr::copy_nonoverlapping(buf.as_ptr(), self.0.as_mut_ptr().offset(old_len as isize), buf.len());
}
// Always reports the full buffer as written (infallible by design).
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> { Ok(()) }
}
/// Repro payload: a tuple of 11 `f32` fields. Serializing only the first 10
/// (`good_bake_bytes`) optimizes well; touching all 11 (`bad_bake_bytes`)
/// trips LLVM's CaptureTracking use-count limit and defeats the optimization.
struct Entity {
o: (f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32),
}
use std::mem::transmute;
/// Writes one `f32` to `w` as its 4 raw (native-endian) bytes.
fn do_f32<W: Write>(w: &mut W, x: f32) {
unsafe {
// Reinterpret the float's bits as bytes: `[f32; 1]` and `[u8; 4]` have
// the same size, so the transmute is well-formed.
let p: [u8; 4] = std::mem::transmute([x]);
// NOTE(review): the io::Result is deliberately ignored; fine for this
// repro because UnsafeVecWriter::write above never returns Err.
w.write(&p);
}
}
// Serializes all 11 fields — the "bad" case. Per the discussion below, with
// this many uses LLVM's CaptureTracking hits its hard-coded threshold while
// walking the uses of the Vec's address, so it conservatively assumes the
// data stores may alias the Vec's length field and reloads/rewrites the
// length around every element (see the first assembly listing below).
// `#[inline(never)]` keeps the function visible in the emitted assembly.
#[inline(never)]
fn bad_bake_bytes(vec: &mut Vec<u8>, e: &Entity) {
let w = &mut UnsafeVecWriter(vec);
do_f32(w, e.o.0);
do_f32(w, e.o.1);
do_f32(w, e.o.2);
do_f32(w, e.o.3);
do_f32(w, e.o.4);
do_f32(w, e.o.5);
do_f32(w, e.o.6);
do_f32(w, e.o.7);
do_f32(w, e.o.8);
do_f32(w, e.o.9);
do_f32(w, e.o.10);
}
// Identical to bad_bake_bytes except the 11th field is commented out — the
// "good" case. With only 10 uses, LLVM can finish its capture analysis,
// prove the data stores don't alias the Vec's length field, and combine all
// the length increments into a single store at the end (see the second
// assembly listing below).
#[inline(never)]
fn good_bake_bytes(vec: &mut Vec<u8>, e: &Entity) {
let w = &mut UnsafeVecWriter(vec);
do_f32(w, e.o.0);
do_f32(w, e.o.1);
do_f32(w, e.o.2);
do_f32(w, e.o.3);
do_f32(w, e.o.4);
do_f32(w, e.o.5);
do_f32(w, e.o.6);
do_f32(w, e.o.7);
do_f32(w, e.o.8);
do_f32(w, e.o.9);
// The single extra field that flips codegen from good to bad:
//do_f32(w, e.o.10);
}
fn main() {
let mut encoded = Vec::new();
// NOTE(review): `mem::uninitialized` is UB-prone and has since been
// deprecated in favor of `MaybeUninit`; also, `encoded` has no reserved
// capacity for the unchecked writes below. Both are acceptable only
// because this is a throwaway codegen repro, not real code.
let decoded: Entity = unsafe { std::mem::uninitialized() };
bad_bake_bytes(&mut encoded, &decoded);
good_bake_bytes(&mut encoded, &decoded);
}
# bad_bake_bytes: for EVERY field the Vec's length slot at 16(%rdi) is
# reloaded, bumped by 4, and stored back, and the buffer pointer at (%rdi)
# is reloaded — because LLVM could not prove the data stores don't clobber
# the Vec header (capture analysis gave up at 11 uses).
__ZN10serde_fast14bad_bake_bytes17h506e94e6df0b1a3bE:
.cfi_startproc
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
# field 0: load f32, reload len, len += 4, store len, reload buf ptr, store
movl (%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
# the same 6-instruction sequence repeats for fields 1..10 below
movl 4(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 8(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 12(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 16(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 20(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 24(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 28(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 32(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 36(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
movl 40(%rsi), %eax
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
movq (%rdi), %rdx
movl %eax, (%rdx,%rcx)
popq %rbp
retq
.cfi_endproc
.p2align 4, 0x90
# good_bake_bytes: the buffer pointer and length are each loaded ONCE
# (%rcx / %rdx), every field store reuses them at a fixed offset, and the
# length is written back a single time at the end — the codegen the author
# wants for bad_bake_bytes too.
__ZN10serde_fast15good_bake_bytes17h3098644f875a0da3E:
.cfi_startproc
pushq %rbp
Lcfi3:
.cfi_def_cfa_offset 16
Lcfi4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi5:
.cfi_def_cfa_register %rbp
movl (%rsi), %eax
# load buffer pointer and length once
movq (%rdi), %rcx
movq 16(%rdi), %rdx
movl %eax, (%rcx,%rdx)
movl 4(%rsi), %eax
movl %eax, 4(%rcx,%rdx)
movl 8(%rsi), %eax
movl %eax, 8(%rcx,%rdx)
movl 12(%rsi), %eax
movl %eax, 12(%rcx,%rdx)
movl 16(%rsi), %eax
movl %eax, 16(%rcx,%rdx)
movl 20(%rsi), %eax
movl %eax, 20(%rcx,%rdx)
movl 24(%rsi), %eax
movl %eax, 24(%rcx,%rdx)
movl 28(%rsi), %eax
movl %eax, 28(%rcx,%rdx)
movl 32(%rsi), %eax
movl %eax, 32(%rcx,%rdx)
movl 36(%rsi), %eax
# single combined length update: len += 40, stored once
leaq 40(%rdx), %rsi
movq %rsi, 16(%rdi)
movl %eax, 36(%rcx,%rdx)
popq %rbp
retq
.cfi_endproc
Wild guess: A larger function trips the inlining thresholds differently, causing a pass ordering issue.
Patrick Walton at 2017-10-06 16:31:15
Running llvm trunk opt doesn't fix it. @sunfishcode any ideas what might be happening here?
Jeff Muizelaar at 2017-10-06 17:08:23
I'm looking into it (building a compiler with the appropriate patch now). In the meantime, just to be clear, the problem here is the
movq 16(%rdi), %rcx
leaq 4(%rcx), %rdx
movq %rdx, 16(%rdi)
updating the length field of the Vec after each element is written in the "bad" version. In the "good" version, LLVM figures out that the length field isn't written to by the other stores, so it's able to combine all the increments and do a single store at the end.
Dan Gohman at 2017-10-06 17:25:33
Exactly.
Jeff Muizelaar at 2017-10-06 17:47:08
It's due to the hard-coded limit in CaptureTracking here.
The code does this:
- load the buffer pointer from the Vec
- store the first float to the buffer
- load the length from the Vec
and the key question is whether that store writes to the memory read by the following load.
The noalias on the argument isn't sufficient; LLVM needs to be sure that the address of the Vec hasn't escaped, such that it could have become stored in the buffer pointer field. CaptureTracking walks through the uses of the Vec's address; however, in the bad_bake_bytes case, it hits the threshold before it sees all of them, and assumes the worst.
Dan Gohman at 2017-10-06 20:19:52
Is there something we can do to make llvm's job easier?
Jeff Muizelaar at 2017-10-06 20:47:54
Could we up the limit as a compiler option here?
For example, MemorySSA has a compiler option for this: http://llvm.org/doxygen/MemorySSA_8cpp.html#a5926ddc0f7c4225c6ced440baa2fb7a3
I know this is the kind of thing we can't select a good value for in general, but for WebRender we could select a value that we know always produces good serde codegen for us.
Patrick Walton at 2017-10-06 21:22:54
@jrmuizel Doing this optimization ahead of time in rustc on MIR would help. It's easier for rustc to tell that pointers don't alias.
Unfortunately, MIR optimizations are missing a lot of basic infrastructure to enable this kind of thing; for example, they can't do inlining or SROA yet…
Patrick Walton at 2017-10-06 21:26:56
I'll write the LLVM patch to add an option if nobody objects.
Patrick Walton at 2017-10-06 21:29:35
Patch is up: https://reviews.llvm.org/D38648
Patrick Walton at 2017-10-06 22:18:53
In the mean time I can work around this by making it so that instead of modifying length every write we just move the pointer and compute the length at the end.
struct UnsafeVecWriter(*mut u8);
impl Write for UnsafeVecWriter {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        unsafe {
            ptr::copy_nonoverlapping(buf.as_ptr(), self.0, buf.len());
            self.0 = self.0.offset(buf.len() as isize);
        }
        Ok(buf.len())
    }
    fn flush(&mut self) -> io::Result<()> { Ok(()) }
}
Jeff Muizelaar at 2017-10-06 22:37:39
@jrmuizel Does WR have good serialization codegen now?
Patrick Walton at 2017-10-06 23:26:11
It's pretty good but could be better. If I run opt -O2 on the IR it improves. Before:
movl (%rsi), %ecx
movl %ecx, (%rax)
movl 4(%rsi), %ecx
movl %ecx, 4(%rax)
movl 8(%rsi), %ecx
movl %ecx, 8(%rax)
movl 12(%rsi), %ecx
movl %ecx, 12(%rax)
movl 16(%rsi), %ecx
movl %ecx, 16(%rax)
movl 20(%rsi), %ecx
movl %ecx, 20(%rax)
movl 24(%rsi), %ecx
movl %ecx, 24(%rax)
movl 28(%rsi), %ecx
movl %ecx, 28(%rax)
movl 32(%rsi), %ecx
movl %ecx, 32(%rax)
movl 36(%rsi), %ecx
movl %ecx, 36(%rax)
movl 40(%rsi), %ecx
movl %ecx, 40(%rax)
After:
movups (%rsi), %xmm0
movups %xmm0, (%rax)
movups 16(%rsi), %xmm0
movups %xmm0, 16(%rax)
movl 32(%rsi), %ecx
movl %ecx, 32(%rax)
movl 36(%rsi), %ecx
movl %ecx, 36(%rax)
movl 40(%rsi), %ecx
movl %ecx, 40(%rax)
Jeff Muizelaar at 2017-10-07 02:19:01
Seems like a pass ordering issue. Perhaps the new pass manager in LLVM will help someday…
Or we could fiddle with the pass ordering ourselves.
Patrick Walton at 2017-10-07 16:00:35
FWIW, it looks like the movups vs movl 24(%rsi), %ecx problem can be solved by building with opt-level=3
Jeff Muizelaar at 2017-10-31 18:35:07
I've filed https://bugzilla.mozilla.org/show_bug.cgi?id=1413285 on Mozilla side about this.
Jeff Muizelaar at 2017-10-31 18:51:43
https://rust.godbolt.org/z/hfG734fGf
Replacing the tuple with an array and iterating over it eliminates the bad codegen, even with -O.
klensy at 2022-05-30 14:38:38
Looks like rustc nightly fixed this problem:
https://godbolt.org/z/x5M61jd9G
Evgeniy Dushistov at 2024-12-14 11:40:56