LLVM's pass ordering chokes on zero-cost abstractions.
The latest example of this I've seen was posted by @Veedrac to Hacker News (try on playground):
static table: [i32; 4] = [0; 4];
pub fn exists_in_table(v: i32) -> bool {
for &x in table.iter() {
if x == v {
return true;
}
}
false
}
After -C opt-level=3 the ~600 lines of unoptimized IR turn into this (note br i1 false):
; playground::exists_in_table
; Function Attrs: nounwind uwtable
define zeroext i1 @_ZN10playground15exists_in_table17h0a39bf7934b9be37E(i32) unnamed_addr #0 {
start:
br label %bb3
bb3: ; preds = %start
br i1 false, label %bb9, label %bb6
bb6: ; preds = %bb3
%1 = icmp eq i32 %0, 0
br i1 %1, label %bb9, label %bb3.1
bb9: ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6
%_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ]
ret i1 %_0.0
bb3.1: ; preds = %bb6
br i1 false, label %bb9, label %bb6.1
bb6.1: ; preds = %bb3.1
%2 = icmp eq i32 %0, 0
br i1 %2, label %bb9, label %bb3.2
bb3.2: ; preds = %bb6.1
br i1 false, label %bb9, label %bb6.2
bb6.2: ; preds = %bb3.2
%3 = icmp eq i32 %0, 0
br i1 %3, label %bb9, label %bb3.3
bb3.3: ; preds = %bb6.2
br i1 false, label %bb9, label %bb6.3
bb6.3: ; preds = %bb3.3
%4 = icmp eq i32 %0, 0
br i1 %4, label %bb9, label %bb3.4
bb3.4: ; preds = %bb6.3
br i1 true, label %bb9, label %bb6.4
bb6.4: ; preds = %bb3.4
br label %bb9
}
cc https://github.com/rust-lang/rust/issues/33299
Simonas Kazlauskas at 2017-08-29 10:56:21
I'm very new to llvm and compilers but I was trying to see what the problem was with this so I was running:
rustc -Cllvm-args='-print-after-all' -O --emit llvm-ir badopt.rs |& rustfiltand get a part of the log that looks like this:*** IR Dump After Loop-Closed SSA Form Pass *** ; Function Attrs: nounwind uwtable define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 { start: br label %bb3 bb3: ; preds = %bb6, %start %iter.sroa.0.0 = phi i32* [ getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 0, i64 0), %start ], [ %2, %bb6 ] %1 = icmp eq i32* %iter.sroa.0.0, getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 1, i64 0) br i1 %1, label %bb9, label %bb6 bb6: ; preds = %bb3 %2 = getelementptr inbounds i32, i32* %iter.sroa.0.0, i64 1 %3 = load i32, i32* %iter.sroa.0.0, align 4 %4 = icmp eq i32 %3, %0 br i1 %4, label %bb9, label %bb3 bb9: ; preds = %bb3, %bb6 %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ] ret i1 %_0.0 } *** IR Dump After Combine redundant instructions *** ; Function Attrs: nounwind uwtable define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 { start: br label %bb3 bb3: ; preds = %start br i1 false, label %bb9, label %bb6 bb6: ; preds = %bb3 %1 = icmp eq i32 %0, 0 br i1 %1, label %bb9, label %bb3.1 bb9: ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6 %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ] ret i1 %_0.0 bb3.1: ; preds = %bb6 br i1 false, label %bb9, label %bb6.1 bb6.1: ; preds = %bb3.1 %2 = icmp eq i32 %0, 0 br i1 %2, label %bb9, label %bb3.2 bb3.2: ; preds = %bb6.1 br i1 false, label %bb9, label %bb6.2 bb6.2: ; preds = %bb3.2 %3 = icmp eq i32 %0, 0 br i1 %3, label %bb9, label %bb3.3 bb3.3: ; preds = %bb6.2 br i1 false, label %bb9, label %bb6.3 bb6.3: ; preds = %bb3.3 %4 = icmp eq i32 %0, 0 br i1 %4, label %bb9, label %bb3.4 bb3.4: ; preds = %bb6.3 br i1 true, label %bb9, label %bb6.4 bb6.4: ; preds = %bb3.4 br label %bb9 
}after the Combine redundant instructions (instcombine) pass there is no more Simplify the CFG (simplifycfg). To me it looks like instcombine change the CFG. But the instcombine pass description states that "This pass does not modify the CFG." is it possible that this is the problem?
Andreas Jonson at 2017-08-30 06:10:05
@andjo403 Oh wow, that is very useful, thanks! I thought I was wrong in assuming that a very complex interaction between passes causes this result. Those are consecutive passes, right?
I don't understand how there are more basic blocks after instcombine. Looking more at it, the
.1, .2, .3 and .4 suffixes suggest loop unrolling, but instcombine shouldn't do that AFAIK.
Eduard-Mihai Burtescu at 2017-08-30 06:36:00
I added -print-before-all also and then there is an Unroll loops between the passes from the log before so instcombine is not the problem. new log:
*** IR Dump Before Unroll loops *** bb3: ; preds = %bb6, %start %iter.sroa.0.0 = phi i32* [ getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 0, i64 0), %start ], [ %2, %bb6 ] %1 = icmp eq i32* %iter.sroa.0.0, getelementptr inbounds ([4 x i32], [4 x i32]* @badopt::table, i64 1, i64 0) br i1 %1, label %bb9, label %bb6 bb6: ; preds = %bb3 %2 = getelementptr inbounds i32, i32* %iter.sroa.0.0, i64 1 %3 = load i32, i32* %iter.sroa.0.0, align 4 %4 = icmp eq i32 %3, %0 br i1 %4, label %bb9, label %bb3 *** IR Dump Before Combine redundant instructions *** ; Function Attrs: nounwind uwtable define zeroext i1 @badopt::exists_in_table(i32) unnamed_addr #0 { start: br label %bb3 bb3: ; preds = %start br i1 false, label %bb9, label %bb6 bb6: ; preds = %bb3 %1 = icmp eq i32 0, %0 br i1 %1, label %bb9, label %bb3.1 bb9: ; preds = %bb6.4, %bb3.4, %bb6.3, %bb3.3, %bb6.2, %bb3.2, %bb6.1, %bb3.1, %bb3, %bb6 %_0.0 = phi i1 [ true, %bb6 ], [ false, %bb3 ], [ false, %bb3.1 ], [ true, %bb6.1 ], [ false, %bb3.2 ], [ true, %bb6.2 ], [ false, %bb3.3 ], [ true, %bb6.3 ], [ false, %bb3.4 ], [ true, %bb6.4 ] ret i1 %_0.0 bb3.1: ; preds = %bb6 br i1 false, label %bb9, label %bb6.1 bb6.1: ; preds = %bb3.1 %2 = icmp eq i32 0, %0 br i1 %2, label %bb9, label %bb3.2 bb3.2: ; preds = %bb6.1 br i1 false, label %bb9, label %bb6.2 bb6.2: ; preds = %bb3.2 %3 = icmp eq i32 0, %0 br i1 %3, label %bb9, label %bb3.3 bb3.3: ; preds = %bb6.2 br i1 false, label %bb9, label %bb6.3 bb6.3: ; preds = %bb3.3 %4 = icmp eq i32 0, %0 br i1 %4, label %bb9, label %bb3.4 bb3.4: ; preds = %bb6.3 br i1 true, label %bb9, label %bb6.4 bb6.4: ; preds = %bb3.4 br label %bb9 }apparently Unroll loops do not make a IR dump after the pass strange.
Andreas Jonson at 2017-08-30 06:50:21
Okay, so the question is why would loop unrolling not have a CFG simplification pass after it?
Eduard-Mihai Burtescu at 2017-08-30 07:04:33
This seems like the right combination of passes: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L782-L790
Also good here: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L620-L625
This on the other hand looks bad: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L374
And this is probably the one that unrolls the loop here: https://github.com/rust-lang/llvm/blob/d9e7d2696e41983b6b5a0b4fac604b4e548a84d3/lib/Transforms/IPO/PassManagerBuilder.cpp#L629-L632
Eduard-Mihai Burtescu at 2017-08-30 07:10:21
So @andjo403 and I spoke about this at lunch the other day, and I took some time today to just add a CFG simplification pass at the places you outlined above to see what happened.
Turns out, adding it on line 633 after the unroll/instruction simplification fixes the issue. The resulting IR for the function reduces to:
; Function Attrs: nounwind uwtable define zeroext i1 @_ZN6static15exists_in_table17hbdcb2c4e00b87deaE(i32) unnamed_addr #0 { start: %1 = icmp eq i32 %0, 0 %. = select i1 %1, i1 true, i1 false ret i1 %. }
which is what we want, I believe.
I suspect that the unroll on line 374 may in fact not be a problem thanks to the CFG simplification pass that follows it on line 382. (Unless the LoadCombine/DCE passes that run in between cause changes that negatively affect SimplifyCFG. I know nothing about LLVM internals, but my hunch is that it's unlikely.)
Fredrik Larsson at 2017-08-31 19:49:41
That's great!
I can't help but notice, though, that the IR you showed could be simplified further by instcombine. Maybe it would make sense to add the simplifyCFG before the instcombine, or add another instcombine afterwards?
Hanna Kruppe at 2017-08-31 19:56:36
I'll run a couple of builds trying that out.
Fredrik Larsson at 2017-08-31 20:02:14
Nice, simply moving SimplifyCFG up before InstCombine gives this:
; Function Attrs: nounwind uwtable define zeroext i1 @_ZN6static15exists_in_table17hbdcb2c4e00b87deaE(i32) unnamed_addr #0 { start: %1 = icmp eq i32 %0, 0 ret i1 %1 }
Adding another InstCombine after the SimplifyCFG from my previous attempt gives the same output.
Not sure if there are situations where the longer incantation of Unroll>InstCombine>SimplifyCFG>InstCombine would be preferable to just Unroll>SimplifyCFG>InstCombine.
Fredrik Larsson at 2017-08-31 20:34:38
Yeah I'm unsure as well. Running more passes for no good reason should be avoided, but InstCombine can also unlock opportunities for CFG simplification (by turning branch conditions into constants), so in general there is no single right order. In fact, some of the PMB pieces @eddyb linked above run InstCombine>SimplifyCFG and others run SimplifyCFG>InstCombine (though none run IC>SimplifyCFG>IC, IIRC). Maybe LLVM developers have an idea? (Has someone already filed an LLVM issue for this?)
Hanna Kruppe at 2017-08-31 20:42:26
I was doing some legwork whilst writing an LLVM bug for this, and noticed that on LLVM trunk, they've added a SimplifyCFG pass at the very end of the function which I modified above (populateModulePassManager).
It was added in this change: https://reviews.llvm.org/rL301395
I made the corresponding change in the Rust tree, and the end result is the second to last LLVM-IR above, with the icmp and select. So not fully optimal but at least much better. I guess Rust would get this change once LLVM 5.0 is in the tree.
Is there still a point to filing an LLVM bug/request for comment on this? Supposedly the suggestion would then probably be to add another InstCombine after the last SimplifyCFG, which should result in the optimal case. If yes I'll try to get one posted tomorrow.
Fredrik Larsson at 2017-09-06 20:22:27
We could bring up whether it might make sense to run SimplifyCFG before instcombine, but tbh it's probably not worth the trouble. @eddyb has had some thoughts about more principled ways to ensure certain simplifications are reliably applied. Such solutions would help all programs, rather than plugging one hole at a time.
I also heard sentiments from LLVM developers that instcombine is already run a bit too often for their tastes, so "pls spam this pass some more so my silly test program is marginally faster" may be dismissed on that grounds as well.
Hanna Kruppe at 2017-09-09 12:46:17
Hi, I'm one of the developers who thinks Instcombine went a little too far :) FWIW, here's my thought (I wrote on the llvm-dev mailing list, but let me copy here in case)
SimplifyCFG is the correct pass for this kind of transformation. I don't think it's entirely unreasonable to run it more often, but the compile time cost needs to be taken into account. I'm not necessarily sympathetic to the idea of adding another canonicalization pass only for this purpose. Side note: IMHO, at this point SimplifyCFG is doing even more than it should (including, e.g. sinking of variables from "almost empty" BBs, i.e. BBs which have only a single instruction & terminator, if I recall correctly). If anything, I'd rather split the sinking logic and other operations to a different pass, rather than moving simplifying proven unconditional branches elsewhere :) Now that LLVM has an incremental dominator API it should be more feasible to run SimplifyCFG more times without recomputing the dominator all the time. I haven't checked whether there are other expensive analyses that need to be preserved.
Davide Italiano at 2017-11-09 23:57:53
Also, FWIW, to clear other people's doubt. SimplifyCFG + Instcombine when run in tandem don't reach a maximal fixpoint. When running the inliner with function simplification passes pipelines, this is fairly visible (see, e.g. Chandler's talk https://www.youtube.com/watch?v=s4wnuiCwTGU for a more detailed explanation of the problem).
It's of course possible to run instcombine + simplifycfg in a loop n times while inlining and that would presumably make the code faster, at some expense for compile time (in the
std::vector<>::push_back example in the talk, that's quantifiable by the number of entries that can be simplified), but in general it should be a good win for some cases.
I guess it's entirely up to the user/downstream language to decide whether, once the new pass manager is in place, it's profitable to run these passes in a loop. I think there are plans to implement this feature, but it's unlikely it will be the default (presumably, it will be guarded by a
cl::opt or something like that).
Davide Italiano at 2017-11-10 00:03:11
@dcci
Looking at performance stats again, I don't think adding 2 more calls to
SimplifyCfg (one between IndVarSimplify/LoopIdiomRecognize — which I think is fairly important, because people tend to get annoyed about LoopIdiomRecognize not working well — and one right after loop unrolling) would be too much of a problem.
Ariel Ben-Yehuda at 2017-11-16 13:02:32
@arielb1 Fair enough. I think it might be a good addition for C++/clang as well, but I haven't gotten the time to check. Some people (TM) are trying very hard upstream to make a mess of SimplifyCFG, but there's hope they'll fail, so I think your plan is reasonable. Can you add performance/compile time numbers after your experiments? Thanks!
cc: @alexcrichton
Davide Italiano at 2017-11-16 18:37:46
Also, I haven't looked at the IR produced by Rust very closely but I'd like to point out two things:
- The current pass manager pipeline (both module and function) has been heavily optimized for C/C++. If Rust wants to propose an alternative, well, maybe we (LLVM) should take a look
- There have been many discussions about the pipeline getting older and older (and therefore, less tuned). During the ThinLTO bringup people tuned it a bit, but the Module and the LTO pipeline are not necessarily the best (e.g. we run loop vectorizer and loop unroller both in the per-TU pipeline and in the LTO pipeline). In other words, if you're interested, this is something that needs some love.
Davide Italiano at 2017-11-16 18:40:58
@dcci heh I'd definitely beleive there's some possible improvements here! AFAIK we're using the "vanilla" pass manager builders in LLVM in the sense of we're at least trying to use the exact same optimization pipeline as C++ in clang. Our codegen I know historically in at least a few places has had to be altered to look more like C++ to get better optimized, but that sort of issue comes up relatively rarely.
Alex Crichton at 2017-11-17 14:47:00
It's of course possible to run instcombine + simplifycfg in a loop n times while inlining and that would presumably make the code faster, at some expense for compile time (in the std::vector<>::push_back example in the talk, that's quantifiable by the number of entries that can be simplified, but in general it should be a good win for some cases.
Would it make sense to apply iterate-to-fixpoint optimization passes on a subset of methods — hotspots found by profiling, basically? Either via an annotation similar to
#[inline] or as part of PGO.
the8472 at 2017-12-28 01:33:12
@the8472 Most combinations of passes (including pretty basic combinations like SimplifyCFG+InstCombine, as @dcci mentioned in the post you quoted) don't generally converge to a fixed point when run together, i.e., on some code iterate-to-fixpoint would loop infinitely.
Hanna Kruppe at 2017-12-28 10:08:23
@rkruppe ok, if it does not reach a fixpoint, does it reach fairly small cycles? In that case a cycle finding algorithm using checkpoints could be used to stop the iteration. Additionally the idea would be to run this in combination with inlining (@dcci's push_back example), so a code size constraint could also be added as abort criteria.
the8472 at 2017-12-28 18:09:05
That's a big if (and getting more unlikely as you add more passes), checkpoints will be tricky to implement and balloon compile time & peak rss even further, and the approach doesn't scale to larger pipelines.
I'm all but convinced that solving phase ordering issues completely can't be achieved by running classical LLVM passes to a fixed point, only by adopting a wholly different approach to optimization (e.g., equality saturation).
Hanna Kruppe at 2017-12-28 22:08:10
@rkruppe: It's pretty easy to show that iterating to a fixpoint is insufficient by showing that there are pairs of optimizations for which no ordering is optimal - (register allocation, instruction selection) is something of the "canonical" such pair, with the solution being to fuse the two into a single pass.
Alan Lawrence's thesis on optimizing using the Value-State Dependence Graph (VSDG) representation covers that pretty nicely, along with a variety of related issues.
In addition, because the VSDG eliminates control flow and non-causal ordering from the IR (then reintroduces it optimally during proceduralization and sequentialization), equality saturation performed on the VSDG may be more efficiently implementable (as those are entire classes of equivalences it won't need to analyze or represent).
eternaleye at 2017-12-28 22:34:16
@rkruppe yes, the compile times are an obvious concern, that's why I suggested it to only applying it to select functions, not globally. E.g. this comment on PGO mentions a bin crate having one hot code path they'd like to optimize.
I'm all but convinced that solving phase ordering issues completely can't be achieved by running classical LLVM passes to a fixed point
But one could still get an improvement over the current situation.
the8472 at 2017-12-28 22:46:46
So if I read this correctly, is this issue the reason why code using
Ord::cmp currently compiles to an instruction sequence that first compares the values, conditionally moves -1/0/1 into a register and then compares this again?
main() at 2018-02-03 18:06:14
Shall this be marked as solved now? The generated code with rustc 1.25.0-nightly (16362c737 2018-02-12):
define zeroext i1 @example::exists_in_table(i32 %v) unnamed_addr #0 !dbg !12 { %0 = icmp eq i32 %v, 0, !dbg !14 ret i1 %0, !dbg !18 }
See https://godbolt.org/g/B9kkeV for diff. Solved by #47828.
Andreas Jonson at 2018-02-14 07:41:44
Perhaps this is a different issue, but I'm still seeing
std::cmp::Ordering values being materialized to -1/0/1 for no reason: https://godbolt.org/g/mbQDZo
pub fn mytest(x: i32, y: i32, i: usize) -> usize { let x = if x == y { ::std::process::exit(0) } else if x < y { 0 } else { 1 }; 2 * (i + 1) - x }
Comparing manually is fine and yields perfect code:
example::mytest: cmp edi, esi je .LBB0_1 setge al add rdx, rdx movzx eax, al sub rdx, rax add rdx, 2 mov rax, rdx ret .LBB0_1: push rbp mov rbp, rsp xor edi, edi call std::process::exit@PLT ud2
Using Ord (a zero-cost abstraction, one would think):
use std::cmp::Ordering::*; pub fn mytest(x: i32, y: i32, i: usize) -> usize { let x = match x.cmp(&y) { Equal => ::std::process::exit(0), Less => 0, Greater => 1, }; 2 * (i + 1) - x }
The Ordering value is first materialized into rcx for no reason, then the code branches on that:
example::mytest: xor eax, eax xor ecx, ecx cmp edi, esi setge cl lea rcx, [rcx + rcx - 1] cmove rcx, rax cmp rcx, 1 je .LBB0_3 test rcx, rcx je .LBB0_2 lea rax, [rax + 2*rdx] add rax, 2 ret .LBB0_3: mov rax, -1 lea rax, [rax + 2*rdx] add rax, 2 ret .LBB0_2: push rbp mov rbp, rsp xor edi, edi call std::process::exit@PLT ud2
This is especially problematic when implementing a generic data structure which has to use Ord and can't simply fall back to an if-else chain (like https://github.com/main--/rust-eytzinger/blob/ea266a38e4c5e72aa0ab934a5bfef8e8fff3bcd8/src/lib.rs#L329-L339).
main() at 2018-02-14 10:59:45
This issue is hardly actionable. We should make a tracking issue (possibly converting this to a tracking issue) and associate specific issues with the tracking issue.
I hope that answers questions posed in the two questions above.
Simonas Kazlauskas at 2018-02-14 11:49:02
Would this be the right place or has a separate issue been created regarding pass optimizations?
the8472 at 2020-11-24 17:05:20
Fixed since rustc 1.56.0: https://rust.godbolt.org/z/rEGGG4aqh
edit: just the original issue. The other issue still persists.
@rustbot label +e-needs-test
Veera at 2025-02-25 00:57:46