TPDE
Analyzer.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include <algorithm>
7#include <format>
8#include <ostream>
9
10#include "IRAdaptor.hpp"
11#include "tpde/ValLocalIdx.hpp"
12#include "tpde/base.hpp"
13#include "util/SmallBitSet.hpp"
14#include "util/SmallVector.hpp"
15
16namespace tpde {
17
18template <IRAdaptor Adaptor>
19struct Analyzer {
20 // some forwards for the IR type defs
21 using IRValueRef = typename Adaptor::IRValueRef;
22 using IRInstRef = typename Adaptor::IRInstRef;
23 using IRBlockRef = typename Adaptor::IRBlockRef;
24 using IRFuncRef = typename Adaptor::IRFuncRef;
25
26 static constexpr IRBlockRef INVALID_BLOCK_REF = Adaptor::INVALID_BLOCK_REF;
27
28
29 static constexpr size_t SMALL_BLOCK_NUM = 64;
30 static constexpr size_t SMALL_VALUE_NUM = 128;
31
32 /// Reference to the adaptor
33 Adaptor *adaptor;
34
35 /// An index into block_layout
36 enum class BlockIndex : u32 {
37 };
38 static constexpr BlockIndex INVALID_BLOCK_IDX = static_cast<BlockIndex>(~0u);
39
40 /// The block layout; a BlockIndex is an index into this array
41 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> block_layout = {};
42
43 /// For each BlockIndex, the corresponding loop
44 // TODO(ts): add the delayed free list in here to save on allocations?
45 util::SmallVector<u32, SMALL_BLOCK_NUM> block_loop_map = {};
46
47 struct Loop {
48 u32 level;
49 u32 parent;
50 // [begin, end[
51 BlockIndex begin = INVALID_BLOCK_IDX, end = INVALID_BLOCK_IDX;
52
53 // for building the loop tree, we accumulate the number of blocks here
54 u32 num_blocks = 0;
55 // TODO(ts): add skip_target?
56
57 u32 definitions = 0, definitions_in_childs = 0;
58 };
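// Example (for illustration): for a function whose final block layout is
// [entry, A, B, C, exit] where A, B, C form a single loop, the loop tree
// would roughly be
//   loops[0]: level 0, parent 0, [begin, end[ = [0, 5[  (root pseudo-loop)
//   loops[1]: level 1, parent 0, [begin, end[ = [1, 4[  (the A/B/C loop)
// i.e. each loop occupies a contiguous sub-range of its parent's range.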
59
60 util::SmallVector<Loop, 16> loops = {};
61
62 // TODO(ts): move all struct definitions to the top?
63 struct LivenessInfo {
64 // [first, last]
65 BlockIndex first, last;
66 u32 ref_count;
67 u32 lowest_common_loop;
68
69 // TODO(ts): maybe outsource both these booleans to a bitset?
70 // we're wasting a lot of space here
71
72 /// The value may not be deallocated until the last block is finished
73 /// even if the reference count hits 0
74 bool last_full;
75
76 u16 epoch = 0;
77 };
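// Example (for illustration): a value defined in block 2 and last used in
// block 5 (both in the same loop) ends up with first = 2, last = 5,
// ref_count = number of uses plus the definition, and last_full = false,
// i.e. it can be released right after its last use inside block 5.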
78
79 util::SmallVector<LivenessInfo, SMALL_VALUE_NUM> liveness = {};
80 /// Epoch of liveness information, entries with a value not equal to this
81 /// epoch are invalid. This is an optimization to avoid clearing the entire
82 /// liveness vector for every function, which is important for functions with
83 /// many values that are ignored for the liveness analysis (e.g., var refs).
84 u16 liveness_epoch = 0;
85 u32 liveness_max_value;
86
87 u32 num_insts;
88
89 explicit Analyzer(Adaptor *adaptor) : adaptor(adaptor) {}
90
91 /// Start the compilation of a new function and build the loop tree and
92 /// liveness information. Previous information is discarded.
93 void switch_func(IRFuncRef func);
94
95 IRBlockRef block_ref(const BlockIndex idx) const noexcept {
96 assert(static_cast<u32>(idx) <= block_layout.size());
97 if (static_cast<u32>(idx) == block_layout.size()) {
98 // this might be called with next_block() which is invalid for the
99 // last block
100 return INVALID_BLOCK_REF;
101 }
102 return block_layout[static_cast<u32>(idx)];
103 }
104
105 BlockIndex block_idx(IRBlockRef block_ref) const noexcept {
106 return static_cast<BlockIndex>(adaptor->block_info(block_ref));
107 }
108
109 const LivenessInfo &liveness_info(const ValLocalIdx val_idx) const noexcept {
110 assert(static_cast<u32>(val_idx) < liveness.size());
111 assert(liveness[static_cast<u32>(val_idx)].epoch == liveness_epoch &&
112 "access to liveness of ignored value");
113 return liveness[static_cast<u32>(val_idx)];
114 }
115
116 u32 block_loop_idx(const BlockIndex idx) const noexcept {
117 return block_loop_map[static_cast<u32>(idx)];
118 }
119
120 const Loop &loop_from_idx(const u32 idx) const noexcept { return loops[idx]; }
121
122 bool block_has_multiple_incoming(const BlockIndex idx) const noexcept {
123 return block_has_multiple_incoming(block_ref(idx));
124 }
125
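// Note: block_info2 is used as a small per-block bitfield by the analysis
// below: bits 0-1 hold a saturating incoming-edge count (2 means "multiple
// incoming edges"), bit 2 marks that the block's successors were pushed
// during the RPO walk, bit 3 marks that the block is already in the RPO
// list, and bit 4 marks that the block has PHIs.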
126 bool block_has_multiple_incoming(IRBlockRef block_ref) const noexcept {
127 return (adaptor->block_info2(block_ref) & 0b11) == 2;
128 }
129
130 bool block_has_phis(BlockIndex idx) const noexcept {
131 return block_has_phis(block_ref(idx));
132 }
133
134 bool block_has_phis(IRBlockRef block_ref) const noexcept {
135 return (adaptor->block_info2(block_ref) & 0b1'0000) != 0;
136 }
137
138 void print_rpo(std::ostream &os) const;
139 void print_block_layout(std::ostream &os) const;
140 void print_loops(std::ostream &os) const;
141 void print_liveness(std::ostream &os) const;
142
143protected:
144 // for use during liveness analysis
145 LivenessInfo &liveness_maybe(const IRValueRef val) noexcept;
146
147 void build_block_layout();
148
149 void build_loop_tree_and_block_layout(
150 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
151 const util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
152 const util::SmallBitSet<256> &loop_heads);
153
154 /// Builds a vector of block references in reverse post-order.
155 void build_rpo_block_order(
156 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &out) const noexcept;
157
158 /// Creates a bitset of loop_heads and sets the parent loop of each block.
159 /// The u32 in loop_parent is an index into block_rpo
160 void identify_loops(
161 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
162 util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
163 util::SmallBitSet<256> &loop_heads) const noexcept;
164
165 void compute_liveness() noexcept;
166};
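// Typical usage is roughly (sketch, assuming an IRAdaptor implementation
// called MyAdaptor and a function handle func):
//
//   MyAdaptor adaptor{/* ... */};
//   Analyzer<MyAdaptor> analyzer{&adaptor};
//   analyzer.switch_func(func);  // builds block layout, loop tree, liveness
//   for (u32 i = 0; i < analyzer.block_layout.size(); ++i) {
//     auto block = analyzer.block_layout[i];
//     // ... emit code for block, consulting analyzer.liveness_info(val_idx)
//   }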
167
168template <IRAdaptor Adaptor>
169void Analyzer<Adaptor>::switch_func([[maybe_unused]] IRFuncRef func) {
170 build_block_layout();
171 compute_liveness();
172}
173
174template <IRAdaptor Adaptor>
175void Analyzer<Adaptor>::print_rpo(std::ostream &os) const {
176 // build_rpo_block_order clobbers block data, so save and restore.
177 util::SmallVector<std::tuple<IRBlockRef, u32, u32>, SMALL_BLOCK_NUM> data;
178 for (IRBlockRef cur : adaptor->cur_blocks()) {
179 data.emplace_back(cur, adaptor->block_info(cur), adaptor->block_info2(cur));
180 }
181
182 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> rpo;
183 build_rpo_block_order(rpo);
184 for (u32 i = 0; i < rpo.size(); ++i) {
185 os << std::format(" {}: {}\n", i, adaptor->block_fmt_ref(rpo[i]));
186 }
187
188 for (const auto &[cur, val1, val2] : data) {
189 adaptor->block_set_info(cur, val1);
190 adaptor->block_set_info2(cur, val2);
191 }
192}
193
194template <IRAdaptor Adaptor>
195void Analyzer<Adaptor>::print_block_layout(std::ostream &os) const {
196 for (u32 i = 0; i < block_layout.size(); ++i) {
197 os << std::format(" {}: {}\n", i, adaptor->block_fmt_ref(block_layout[i]));
198 }
199}
200
201template <IRAdaptor Adaptor>
202void Analyzer<Adaptor>::print_loops(std::ostream &os) const {
203 for (u32 i = 0; i < loops.size(); ++i) {
204 const auto &loop = loops[i];
205 os << std::format(" {}: level {}, parent {}, {}->{}\n",
206 i,
207 loop.level,
208 loop.parent,
209 static_cast<u32>(loop.begin),
210 static_cast<u32>(loop.end));
211 }
212}
213
214template <IRAdaptor Adaptor>
215void Analyzer<Adaptor>::print_liveness(std::ostream &os) const {
216 for (u32 i = 0; i <= liveness_max_value; ++i) {
217 if (liveness[i].epoch != liveness_epoch) {
218 os << std::format(" {}: ignored\n", i);
219 continue;
220 }
221
222 const auto &info = liveness[i];
223 os << std::format(" {}: {} refs, {}->{} ({}->{}), lf: {}\n",
224 i,
225 info.ref_count,
226 static_cast<u32>(info.first),
227 static_cast<u32>(info.last),
228 adaptor->block_fmt_ref(block_ref(info.first)),
229 adaptor->block_fmt_ref(block_ref(info.last)),
230 info.last_full);
231 }
232}
233
234template <IRAdaptor Adaptor>
235typename Analyzer<Adaptor>::LivenessInfo &
236 Analyzer<Adaptor>::liveness_maybe(const IRValueRef val) noexcept {
237 const ValLocalIdx val_idx = adaptor->val_local_idx(val);
238 if constexpr (Adaptor::TPDE_PROVIDES_HIGHEST_VAL_IDX) {
239 assert(liveness.size() > static_cast<u32>(val_idx));
240 return liveness[static_cast<u32>(val_idx)];
241 } else {
242 if (liveness_max_value <= static_cast<u32>(val_idx)) {
243 liveness_max_value = static_cast<u32>(val_idx);
244 if (liveness.size() <= liveness_max_value) {
245 // TODO: better growth strategy?
246 liveness.resize(liveness_max_value + 0x100);
247 }
248 }
249 return liveness[static_cast<u32>(val_idx)];
250 }
251}
252
253template <IRAdaptor Adaptor>
254void Analyzer<Adaptor>::build_block_layout() {
255 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> block_rpo{};
256 build_rpo_block_order(block_rpo);
257
258 util::SmallVector<u32, SMALL_BLOCK_NUM> loop_parent{};
259 util::SmallBitSet<256> loop_heads{};
260
261 // TODO(ts): print out this step?
262 identify_loops(block_rpo, loop_parent, loop_heads);
263 assert(loop_parent.size() == block_rpo.size());
264 // the entry block is always the loop head for the root loop
265 loop_heads.mark_set(0);
266
267 build_loop_tree_and_block_layout(block_rpo, loop_parent, loop_heads);
268}
269
270template <IRAdaptor Adaptor>
271void Analyzer<Adaptor>::build_loop_tree_and_block_layout(
272 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
273 const util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
274 const util::SmallBitSet<256> &loop_heads) {
275 // TODO(ts): maybe merge this into the block_rpo?
276 struct BlockLoopInfo {
277 u32 loop_idx;
278 u32 rpo_idx;
279 };
280
281 util::SmallVector<BlockLoopInfo, SMALL_BLOCK_NUM> loop_blocks;
282 loop_blocks.resize_uninitialized(block_rpo.size());
283 for (u32 i = 0; i < block_rpo.size(); ++i) {
284 loop_blocks[i] = BlockLoopInfo{~0u, i};
285 }
286
287 // if we have not seen the parent loop before, we need to insert it
288 // recursively.
289 // the recursive call only happens with irreducible control-flow
290 const auto build_or_get_parent_loop = [&](const u32 i,
291 const auto &self) -> u32 {
292 const auto parent = loop_parent[i];
293 if (loop_blocks[parent].loop_idx != ~0u) {
294 // we have already seen that block and given it a loop
295 return loop_blocks[parent].loop_idx;
296 } else {
297 // get the parent loop and build the loop for this block
298 const auto parent_loop_idx = self(parent, self);
299 const auto loop_idx = loops.size();
300 loops.push_back(Loop{.level = loops[parent_loop_idx].level + 1,
301 .parent = parent_loop_idx});
302 loop_blocks[parent].loop_idx = loop_idx;
303 return loop_idx;
304 }
305 };
306
307 // entry is always the head of the top-level loop
308 loops.clear();
309 loops.push_back(Loop{.level = 0, .parent = 0, .num_blocks = 1});
310 loop_blocks[0].loop_idx = 0;
311
312 for (u32 i = 1; i < loop_parent.size(); ++i) {
313 const u32 parent_loop =
314 build_or_get_parent_loop(i, build_or_get_parent_loop);
315
316 if (loop_heads.is_set(i)) {
317 // if the loop is irreducible, we might have already inserted it so
318 // check for that.
319 //
320 // NOTE: we could also get away with unsetting loop_heads for an
321 // irreducible loop if we did not count the loop_head in
322 // its own loop's num_blocks but in its parent's
323 auto loop_idx = loop_blocks[i].loop_idx;
324 if (loop_idx == ~0u) [[likely]] {
325 loop_idx = loops.size();
326 loops.push_back(
327 Loop{.level = loops[parent_loop].level + 1, .parent = parent_loop});
328 loop_blocks[i].loop_idx = loop_idx;
329 }
330 ++loops[loop_idx].num_blocks;
331 } else {
332 loop_blocks[i].loop_idx = parent_loop;
333 ++loops[parent_loop].num_blocks;
334 }
335 }
336
337 // accumulate the total number of blocks in a loop by iterating over them in
338 // reverse order. this works since we always push the parents first.
339 // leave out the first loop since it is its own parent
340 for (u32 i = loops.size() - 1; i > 0; --i) {
341 const auto &loop = loops[i];
342 loops[loop.parent].num_blocks += loop.num_blocks;
343 }
344
345 assert(loops[0].num_blocks == block_rpo.size());
346
347 // now lay out the blocks by iterating in RPO and either placing them at the
348 // current offset of the parent loop or, if they start a new loop, placing the
349 // whole loop at the offset of the parent. this ensures that blocks
350 // enclosed by loops also end up inside those loops' ranges in the final order.
351 //
352 // however, the way this is implemented causes the loop indices to not
353 // correspond 1:1 with the final layout, though they will still be tightly
354 // packed and only the order inside a loop may change. note(ts): this could
355 // be mitigated with another pass, I think.
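// Example (for illustration): with an RPO of entry, O, I, I2, O2, exit,
// where O..O2 form an outer loop and I, I2 a nested inner loop, the pass
// below reserves [1, 5[ for the outer loop and [2, 4[ for the inner one,
// so both loops end up as contiguous ranges in block_layout.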
356 //
357 // NB: we don't clear block_layout/block_loop_map, all entries are overwritten
358 block_layout.resize_uninitialized(block_rpo.size());
359 // TODO(ts): merge this with block_layout for less malloc calls?
360 // however, we will mostly need this in the liveness computation so it may
361 // be better for cache utilization to keep them separate
362 block_loop_map.resize_uninitialized(block_rpo.size());
363
364 loops[0].begin = loops[0].end = static_cast<BlockIndex>(0);
365
366 const auto layout_loop = [&](const u32 loop_idx, const auto &self) -> void {
367 assert(loops[loop_idx].begin == INVALID_BLOCK_IDX);
368 const auto parent = loops[loop_idx].parent;
369 if (loops[parent].begin == INVALID_BLOCK_IDX) {
370 // should only happen with irreducible control-flow
371 self(parent, self);
372 }
373
374 const auto loop_begin = loops[parent].end;
375 loops[parent].end = static_cast<BlockIndex>(static_cast<u32>(loop_begin) +
376 loops[loop_idx].num_blocks);
377 assert(static_cast<u32>(loops[parent].end) -
378 static_cast<u32>(loops[parent].begin) <=
379 loops[parent].num_blocks);
380
381 loops[loop_idx].begin = loops[loop_idx].end = loop_begin;
382 };
383
384 for (u32 i = 0u; i < block_rpo.size(); ++i) {
385 const auto loop_idx = loop_blocks[i].loop_idx;
386 if (loops[loop_idx].begin == INVALID_BLOCK_IDX) {
387 layout_loop(loop_idx, layout_loop);
388 }
389
390 const auto block_ref = block_rpo[loop_blocks[i].rpo_idx];
391 const auto block_idx = static_cast<u32>(loops[loop_idx].end);
392 loops[loop_idx].end = static_cast<BlockIndex>(block_idx + 1);
393
394 block_layout[block_idx] = block_ref;
395 block_loop_map[block_idx] = loop_idx;
396 adaptor->block_set_info(block_ref, block_idx);
397 }
398
399 assert(static_cast<u32>(loops[0].end) == block_rpo.size());
400
401 // TODO(ts): this is currently disabled as it wants to enforce that loop
402 // children directly follow their parent, which the algorithm above does not
403 // guarantee. But I don't think any code actually relies on this being true,
404 // just that children have to follow their parent
405#if defined(TPDE_ASSERTS) && 0
406 struct Constraint {
407 u32 begin, end;
408 u32 index;
409 u32 level;
410 };
411
412 util::SmallVector<Constraint, 16> constraint_stack{};
413 constraint_stack.push_back(
414 Constraint{0, static_cast<u32>(loops[0].end), 0, 0});
415 for (u32 i = 1; i < loops.size(); ++i) {
416 const auto &loop = loops[i];
417 assert(!constraint_stack.empty());
418 while (static_cast<u32>(loop.begin) >= constraint_stack.back().end) {
419 constraint_stack.pop_back();
420 }
421
422 const auto &con = constraint_stack.back();
423 assert(static_cast<u32>(loop.end) - static_cast<u32>(loop.begin) ==
424 loop.num_blocks);
425 assert(static_cast<u32>(loop.begin) >= con.begin);
426 assert(static_cast<u32>(loop.end) <= con.end);
427 assert(loop.parent == con.index);
428 assert(loop.level == con.level + 1);
429
430 constraint_stack.push_back(Constraint{static_cast<u32>(loop.begin),
431 static_cast<u32>(loop.end),
432 i,
433 loop.level});
434 }
435
436#endif
437}
438
439template <IRAdaptor Adaptor>
440void Analyzer<Adaptor>::build_rpo_block_order(
441 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &out) const noexcept {
442 out.clear();
443
444 u32 num_blocks = 0;
445 {
446 // Initialize the block info
447 u32 idx = 0;
448 for (IRBlockRef cur : adaptor->cur_blocks()) {
449 adaptor->block_set_info(cur, idx);
450 adaptor->block_set_info2(cur, 0);
451 ++idx;
452 }
453 num_blocks = idx;
454 }
455 out.resize_uninitialized(num_blocks);
456
457 // implement the RPO generation using a simple stack that walks in
458 // post-order and then reverses at the end. However, consider the following
459 // CFG
460 //
461 // A:
462 // - B:
463 // - D
464 // - E
465 // - C:
466 // - F
467 // - G
468 //
469 // which has valid RPOs A B D E C F G, A B D E C G F, A B E D C F G, ...
470 // this ambiguity is not very nice for loops since in many IRs the order of
471 // the blocks has some meaning for the layout, so we sort the pushed children
472 // by their order in the block list
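// e.g., assuming the blocks appear in the order A B C D E F G in the IR,
// the sorting makes the traversal deterministic and should always pick
// A B D E C F G out of the valid RPOs above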
473 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM / 2> stack;
474 stack.push_back(adaptor->cur_entry_block());
475
476 // NOTE(ts): because we process children in reverse order
477 // this gives somewhat funky results with irreducible loops
478 // but it should be fine I think
479 auto rpo_idx = num_blocks - 1;
480 while (!stack.empty()) {
481 const auto cur_node = stack.back();
482
483 // have we already added the block to the RPO list?
484 // if so we just skip it
485 if (adaptor->block_info2(cur_node) & 0b1000) {
486 stack.pop_back();
487 continue;
488 }
489
490 // have we already pushed the children of the node to the stack and
491 // processed them? if so we can add the block to the post-order,
492 // otherwise add the children and wait for them to be processed
493 if (adaptor->block_info2(cur_node) & 0b100) {
494 stack.pop_back();
495 // set the block's RPO index and push it to the RPO list
496 adaptor->block_set_info(cur_node, rpo_idx);
497 // mark the block as already being on the RPO list
498 adaptor->block_set_info2(cur_node,
499 adaptor->block_info2(cur_node) | 0b1000);
500 out[rpo_idx] = cur_node;
501 --rpo_idx;
502 continue;
503 }
504
505 // mark as visited and add the successors
506 adaptor->block_set_info2(cur_node, adaptor->block_info2(cur_node) | 0b100);
507
508 const auto start_idx = stack.size();
509 // push the successors onto the stack to visit them
510 for (const auto succ : adaptor->block_succs(cur_node)) {
511 assert(succ != adaptor->cur_entry_block());
512 const u32 info = adaptor->block_info2(succ);
513 if ((info & 0b11) != 0) {
514 if ((info & 0b11) == 1) {
515 // note that succ has more than one incoming edge
516 adaptor->block_set_info2(succ, (info & ~0b11) | 2);
517 }
518 } else {
519 // TODO(ts): if we just do a post-order traversal and reverse
520 // everything at the end we could use only block_info to check
521 // whether we already saw a block. And I forgot why?
522 adaptor->block_set_info2(succ, info | 1);
523 }
524
525 // if the successor is already on the rpo list or has already been
526 // visited and had its children added
527 if (adaptor->block_info2(succ) & 0b1100) {
528 continue;
529 }
530
531 stack.push_back(succ);
532 }
533
534 // Order the pushed children by their original block
535 // index since the children get visited in reverse and then inserted
536 // in reverse order in the rpo list @_@
537 const auto len = stack.size() - start_idx;
538 if (len <= 1) {
539 continue;
540 }
541
542 if (len == 2) {
543 if (adaptor->block_info(stack[start_idx]) >
544 adaptor->block_info(stack[start_idx + 1])) {
545 std::swap(stack[start_idx], stack[start_idx + 1]);
546 }
547 continue;
548 }
549
550 std::sort(stack.begin() + start_idx,
551 stack.end(),
552 [this](const IRBlockRef lhs, const IRBlockRef rhs) {
553 // note(ts): this may have rather poor performance
554 // characteristics if the block lookup is a hashmap, so
555 // maybe cache this for larger lists?
556 return adaptor->block_info(lhs) < adaptor->block_info(rhs);
557 });
558 }
559
560 if (rpo_idx != 0xFFFF'FFFF) {
561 // there are unreachable blocks
562 // so we did not fill up the whole array and need to shift it
563 // TODO(ts): benchmark this against filling up a vector and always
564 // reversing it, though I think this should be better
565 out.erase(out.begin(), out.begin() + 1 + rpo_idx);
566
567 // need to fixup the RPO index for blocks as well :/
568 for (auto i = 0u; i < out.size(); ++i) {
569 adaptor->block_set_info(out[i], i);
570 }
571
572#ifndef NDEBUG
573 // In debug builds, reset block index of unreachable blocks.
574 for (IRBlockRef cur : adaptor->cur_blocks()) {
575 if (adaptor->block_info2(cur) == 0) {
576 adaptor->block_set_info(cur, 0xFFFF'FFFF);
577 }
578 }
579#endif
580 }
581
582#ifdef TPDE_LOGGING
583 TPDE_LOG_TRACE("Finished building RPO for blocks:");
584 for (u32 i = 0; i < out.size(); ++i) {
585 TPDE_LOG_TRACE("Index {}: {}", i, adaptor->block_fmt_ref(out[i]));
586 }
587#endif
588}
589
590template <IRAdaptor Adaptor>
591void Analyzer<Adaptor>::identify_loops(
592 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
593 util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
594 util::SmallBitSet<256> &loop_heads) const noexcept {
595 loop_parent.clear();
596 loop_parent.resize(block_rpo.size());
597 loop_heads.clear();
598 loop_heads.resize(block_rpo.size());
599
600 // Implement the modified algorithm from Wei et al.: A New Algorithm for
601 // Identifying Loops in Decompilation
602 // in a non-recursive form
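//
// Rough example: for blocks 0 -> 1, 1 -> 2, 2 -> 1, 2 -> 3 (in RPO order),
// block 1 is detected as a loop head via the back edge 2 -> 1, so
// loop_heads = {1} and loop_parent[2] = 1, while blocks 0 and 3 stay in
// the implicit root loop (loop_parent == 0).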
603
604 // TODO(ts): ask the adaptor if it can store this information for us?
605 // then we could save the allocation here
606 struct BlockInfo {
607 bool traversed;
608 bool self_loop;
609 u32 dfsp_pos;
610 u32 iloop_header;
611 };
612
613 util::SmallVector<BlockInfo, SMALL_BLOCK_NUM> block_infos;
614 block_infos.resize(block_rpo.size());
615
616 struct TravState {
617 u32 block_idx; // b0 from the paper
618 u32 dfsp_pos;
619 u32 nh;
620 u8 state = 0;
621 decltype(adaptor->block_succs(adaptor->cur_entry_block()).begin()) succ_it;
622 decltype(adaptor->block_succs(adaptor->cur_entry_block()).end()) end_it;
623 };
624
625 // The traversal stack will grow to the depth of the CFG so the small vector
626 // needs to be relatively big
627 // TODO(ts): maybe use the recursive form for small CFGs since that may be
628 // faster, or use stack switching since the non-recursive version is really
629 // ugly
630 util::SmallVector<TravState, SMALL_BLOCK_NUM> trav_state;
631
632 trav_state.push_back(
633 TravState{.block_idx = 0,
634 .dfsp_pos = 1,
635 .succ_it = adaptor->block_succs(block_rpo[0]).begin(),
636 .end_it = adaptor->block_succs(block_rpo[0]).end()});
637
638 const auto tag_lhead = [&block_infos](u32 b, u32 h) {
639 if (b == h || h == 0) {
640 return;
641 }
642
643 auto cur1 = b, cur2 = h;
644 while (block_infos[cur1].iloop_header != 0) {
645 const auto ih = block_infos[cur1].iloop_header;
646 if (ih == cur2) {
647 return;
648 }
649 if (block_infos[ih].dfsp_pos < block_infos[cur2].dfsp_pos) {
650 block_infos[cur1].iloop_header = cur2;
651 cur1 = cur2;
652 cur2 = ih;
653 } else {
654 cur1 = ih;
655 }
656 }
657 block_infos[cur1].iloop_header = cur2;
658 };
659
660 // TODO(ts): this can be optimized and condensed so that the state
661 // variable becomes unnecessary
662 while (!trav_state.empty()) {
663 auto &state = trav_state.back();
664 const auto block_idx = state.block_idx;
665 switch (state.state) {
666 case 0: {
667 // entry
668 block_infos[block_idx].traversed = true;
669 block_infos[block_idx].dfsp_pos = state.dfsp_pos;
670
671 // TODO(ts): somehow make sure that the iterators can live when
672 // succs is destroyed?
673 // auto succs = adaptor->block_succs(block_rpo[block_idx]);
674 // state.succ_it = succs.begin();
675 // state.end_it = succs.end();
676 state.state = 1;
677 }
678 [[fallthrough]];
679 case 1: {
680 loop_inner:
681 auto cont = false;
682 while (state.succ_it != state.end_it) {
683 const auto succ_idx = adaptor->block_info(*state.succ_it);
684 if (succ_idx == block_idx) {
685 block_infos[block_idx].self_loop = true;
686 }
687
688 if (block_infos[succ_idx].traversed) {
689 if (block_infos[succ_idx].dfsp_pos > 0) {
690 tag_lhead(block_idx, succ_idx);
691 } else if (block_infos[succ_idx].iloop_header != 0) {
692 auto h_idx = block_infos[succ_idx].iloop_header;
693 if (block_infos[h_idx].dfsp_pos > 0) {
694 tag_lhead(block_idx, h_idx);
695 } else {
696 while (block_infos[h_idx].iloop_header != 0) {
697 h_idx = block_infos[h_idx].iloop_header;
698 if (block_infos[h_idx].dfsp_pos > 0) {
699 tag_lhead(block_idx, h_idx);
700 break;
701 }
702 }
703 }
704 }
705 ++state.succ_it;
706 continue;
707 }
708
709 // recurse
710 cont = true;
711 state.state = 2;
712 // TODO(ts): somehow make sure that the iterators can live when
713 // succs is destroyed?
714 auto succs = adaptor->block_succs(block_rpo[succ_idx]);
715 trav_state.push_back(TravState{.block_idx = succ_idx,
716 .dfsp_pos = state.dfsp_pos + 1,
717 .succ_it = succs.begin(),
718 .end_it = succs.end()});
719 break;
720 }
721
722 if (cont) {
723 continue;
724 }
725
726 block_infos[block_idx].dfsp_pos = 0;
727 trav_state.pop_back();
728 if (trav_state.empty()) {
729 break;
730 }
731 trav_state.back().nh = block_infos[block_idx].iloop_header;
732 continue;
733 }
734 case 2: {
735 const auto nh = state.nh;
736 tag_lhead(block_idx, nh);
737
738 ++state.succ_it;
739 state.state = 1;
740 goto loop_inner;
741 }
742 }
743 }
744
745 for (u32 i = 0; i < block_rpo.size(); ++i) {
746 auto &info = block_infos[i];
747 if (info.iloop_header != 0) {
748 loop_parent[i] = info.iloop_header;
749 loop_heads.mark_set(info.iloop_header);
750 }
751 if (info.self_loop) {
752 loop_heads.mark_set(i);
753 }
754 }
755}
756
757template <IRAdaptor Adaptor>
758void Analyzer<Adaptor>::compute_liveness() noexcept {
759 // implement the liveness algorithm described in
760 // http://databasearchitects.blogspot.com/2020/04/linear-time-liveness-analysis.html
761 // and Kohn et al.: Adaptive Execution of Compiled Queries
762 // TODO(ts): also expose the two-pass liveness algo as an option
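//
// The gist (sketch): every value gets a [first, last] block interval plus a
// ref_count; a use in the value's lowest common loop just widens the
// interval, while a use in a more deeply nested loop widens it to the whole
// sub-loop and marks last_full. E.g. a value defined in block 2 and used in
// block 5 inside a loop spanning blocks [4, 7[ ends up with first = 2,
// last = 6, last_full = true.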
763
764 TPDE_LOG_TRACE("Starting Liveness Analysis");
765
766 // Bump epoch. On overflow, we must clear all liveness info entries.
767 if (++liveness_epoch == 0) {
768 liveness.clear();
769 liveness_epoch = 1;
770 }
771
772 if constexpr (Adaptor::TPDE_PROVIDES_HIGHEST_VAL_IDX) {
773 liveness_max_value = adaptor->cur_highest_val_idx();
774 if (liveness_max_value >= liveness.size()) {
775 liveness.resize(liveness_max_value + 1);
776 }
777 } else {
778 liveness_max_value = 0;
779 }
780
781 num_insts = 0;
782
783 const auto visit = [this](const IRValueRef value, const u32 block_idx) {
784 TPDE_LOG_TRACE(" Visiting value {} in block {}",
785 adaptor->value_fmt_ref(value),
786 block_idx);
787 if (adaptor->val_ignore_in_liveness_analysis(value)) {
788 TPDE_LOG_TRACE(" value is ignored");
789 return;
790 }
791
792 auto &liveness = liveness_maybe(value);
793 if (liveness.epoch != liveness_epoch) {
794 TPDE_LOG_TRACE(" initializing liveness info, lcl is {}",
795 block_loop_map[block_idx]);
796 liveness = LivenessInfo{
797 .first = static_cast<BlockIndex>(block_idx),
798 .last = static_cast<BlockIndex>(block_idx),
799 .ref_count = 1,
800 .lowest_common_loop = block_loop_map[block_idx],
801 .last_full = false,
802 .epoch = liveness_epoch,
803 };
804 return;
805 }
806
807 assert(liveness.ref_count != ~0u && "used value without definition");
808
809 ++liveness.ref_count;
810 TPDE_LOG_TRACE(" increasing ref_count to {}", liveness.ref_count);
811
812 // helpers
813 const auto update_for_block_only = [&liveness, block_idx]() {
814 const auto old_first = static_cast<u32>(liveness.first);
815 const auto old_last = static_cast<u32>(liveness.last);
816
817 const auto new_first = std::min(old_first, block_idx);
818 const auto new_last = std::max(old_last, block_idx);
819 liveness.first = static_cast<BlockIndex>(new_first);
820 liveness.last = static_cast<BlockIndex>(new_last);
821
822 // if last changed, we don't need to extend the lifetime to the end
823 // of the last block
824 liveness.last_full = (old_last == new_last) ? liveness.last_full : false;
825 };
826
827 const auto update_for_loop = [&liveness](const Loop &loop) {
828 const auto old_first = static_cast<u32>(liveness.first);
829 const auto old_last = static_cast<u32>(liveness.last);
830
831 const auto new_first = std::min(old_first, static_cast<u32>(loop.begin));
832 const auto new_last = std::max(old_last, static_cast<u32>(loop.end) - 1);
833 liveness.first = static_cast<BlockIndex>(new_first);
834 liveness.last = static_cast<BlockIndex>(new_last);
835
836 // if last changed, set last_full to true
837 // since the value needs to stay allocated while the loop is active
838 liveness.last_full = (old_last == new_last) ? liveness.last_full : true;
839 };
840
841 const auto block_loop_idx = block_loop_map[block_idx];
842 if (liveness.lowest_common_loop == block_loop_idx) {
843 // just extend the liveness interval
844 TPDE_LOG_TRACE(" lcl is same as block loop");
845 update_for_block_only();
846
847 TPDE_LOG_TRACE(" new interval {}->{}, lf: {}",
848 static_cast<u32>(liveness.first),
849 static_cast<u32>(liveness.last),
850 liveness.last_full);
851 return;
852 }
853
854 const Loop &liveness_loop = loops[liveness.lowest_common_loop];
855 const Loop &block_loop = loops[block_loop_idx];
856
857 if (liveness_loop.level < block_loop.level &&
858 static_cast<u32>(block_loop.begin) <
859 static_cast<u32>(liveness_loop.end)) {
860 assert(static_cast<u32>(block_loop.end) <=
861 static_cast<u32>(liveness_loop.end));
862
863 TPDE_LOG_TRACE(" block_loop {} is nested inside lcl", block_loop_idx);
864 // The current use is nested inside the loop of the liveness
865 // interval so we only need to get the loop at level
866 // (liveness_loop.level + 1) that contains block_loop and extend the
867 // liveness interval
868 const auto target_level = liveness_loop.level + 1;
869 auto cur_loop_idx = block_loop_idx;
870 auto cur_level = block_loop.level;
871 // TODO(ts): we could also skip some loops here since we know that
872 // there have to be at least n=(cur_level-target_level) loops
873 // between cur_loop and target_loop
874 // so we could choose cur_loop_idx = min(cur_parent, cur_idx-n)?
875 // however this might jump into more nested loops
876 // so maybe check (cur_idx-n).level first?
877 while (cur_level != target_level) {
878 cur_loop_idx = loops[cur_loop_idx].parent;
879 --cur_level;
880 }
881 assert(loops[cur_loop_idx].level == target_level);
882 TPDE_LOG_TRACE(" target_loop is {}", cur_loop_idx);
883 update_for_loop(loops[cur_loop_idx]);
884
885 TPDE_LOG_TRACE(" new interval {}->{}, lf: {}",
886 static_cast<u32>(liveness.first),
887 static_cast<u32>(liveness.last),
888 liveness.last_full);
889 return;
890 }
891
892 TPDE_LOG_TRACE(" block_loop {} is nested higher or in a different loop",
893 block_loop_idx);
894 // need to update the lowest common loop to contain both liveness_loop
895 // and block_loop and then extend the interval accordingly
896
897 // TODO(ts): this algorithm is currently worst-case O(n) which makes the
898 // whole liveness analysis worst-case O(n^2) which is not good. However,
899 // we expect the number of loops to be much smaller than the number of
900 // values so it should be fine for most programs. To be safe, we should
901 // implement the algorithm from "Gusfield: Constant-Time Lowest Common
902 // Ancestor Retrieval" which seems to be the one with the most
903 // reasonable overhead. It is however not zero and the queries are also
904 // not exactly free so we should definitely benchmark both versions first
905 // with both small and large programs
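//
// The walk below is a plain "climb both chains to the same level, then
// climb in lockstep" LCA, e.g. for lhs at level 2 (parent at level 1, then
// the root) and rhs at level 1 directly below the root, it walks lhs up
// once, then both up once, and meets at the root loop.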
906
907 auto lhs_idx = liveness.lowest_common_loop;
908 auto rhs_idx = block_loop_idx;
909 auto prev_rhs = rhs_idx;
910 auto prev_lhs = lhs_idx;
911 while (lhs_idx != rhs_idx) {
912 const auto lhs_level = loops[lhs_idx].level;
913 const auto rhs_level = loops[rhs_idx].level;
914 if (lhs_level > rhs_level) {
915 prev_lhs = lhs_idx;
916 lhs_idx = loops[lhs_idx].parent;
917 } else if (lhs_level < rhs_level) {
918 prev_rhs = rhs_idx;
919 rhs_idx = loops[rhs_idx].parent;
920 } else {
921 prev_lhs = lhs_idx;
922 prev_rhs = rhs_idx;
923 lhs_idx = loops[lhs_idx].parent;
924 rhs_idx = loops[rhs_idx].parent;
925 }
926 }
927
928 assert(static_cast<u32>(loops[lhs_idx].begin) <=
929 static_cast<u32>(liveness_loop.begin));
930 assert(static_cast<u32>(loops[lhs_idx].end) >=
931 static_cast<u32>(liveness_loop.end));
932 TPDE_LOG_TRACE(" new lcl is {}", lhs_idx);
933
934 liveness.lowest_common_loop = lhs_idx;
935
936 // extend for the full loop that contains liveness_loop and is nested
937 // directly in lcl
938 assert(loops[prev_lhs].parent == lhs_idx);
939 assert(static_cast<u32>(loops[prev_lhs].begin) <=
940 static_cast<u32>(liveness_loop.begin));
941 assert(static_cast<u32>(loops[prev_lhs].end) >=
942 static_cast<u32>(liveness_loop.end));
943 update_for_loop(loops[prev_lhs]);
944
945 // extend by block if the block_loop is the lcl
946 // or by prev_rhs (the loop containing block_loop nested directly in
947 // lcl) otherwise
948 if (lhs_idx == block_loop_idx) {
949 update_for_block_only();
950 } else {
951 assert(loops[prev_rhs].parent == lhs_idx);
952 assert(loops[prev_rhs].level == loops[lhs_idx].level + 1);
953 update_for_loop(loops[prev_rhs]);
954 }
955
956 TPDE_LOG_TRACE(" new interval {}->{}, lf: {}",
957 static_cast<u32>(liveness.first),
958 static_cast<u32>(liveness.last),
959 liveness.last_full);
960 };
961
962 assert(block_layout[0] == adaptor->cur_entry_block());
963 if constexpr (Adaptor::TPDE_LIVENESS_VISIT_ARGS) {
964 for (const IRValueRef arg : adaptor->cur_args()) {
965 visit(arg, 0);
966 }
967 }
968
969 for (u32 block_idx = 0; block_idx < block_layout.size(); ++block_idx) {
970 IRBlockRef block = block_layout[block_idx];
971 TPDE_LOG_TRACE(
972 "Analyzing block {} ('{}')", block_idx, adaptor->block_fmt_ref(block));
973 const auto block_loop_idx = block_loop_map[block_idx];
974
975 bool has_phis = false;
976 for (const IRValueRef phi : adaptor->block_phis(block)) {
977 TPDE_LOG_TRACE("Analyzing phi {}", adaptor->value_fmt_ref(phi));
978 has_phis = true;
979
980 const auto phi_ref = adaptor->val_as_phi(phi);
981 const u32 slot_count = phi_ref.incoming_count();
982 for (u32 i = 0; i < slot_count; ++i) {
983 const IRBlockRef incoming_block = phi_ref.incoming_block_for_slot(i);
984 const IRValueRef incoming_value = phi_ref.incoming_val_for_slot(i);
985 if (adaptor->block_info2(incoming_block) == 0) {
986 TPDE_LOG_TRACE("ignoring phi input from unreachable pred ({})",
987 adaptor->block_fmt_ref(incoming_block));
988 continue;
989 }
990 const auto incoming_block_idx = adaptor->block_info(incoming_block);
991
992 // mark the incoming value as used in the incoming block
993 visit(incoming_value, incoming_block_idx);
994 // mark the PHI-value as used in the incoming block
995 visit(phi, incoming_block_idx);
996 }
997 }
998
999 if (has_phis) {
1000 adaptor->block_set_info2(block, adaptor->block_info2(block) | 0b1'0000);
1001 }
1002
1003 for (const IRInstRef inst : adaptor->block_insts(block)) {
1004 TPDE_LOG_TRACE("Analyzing instruction {}", adaptor->inst_fmt_ref(inst));
1005 for (const IRValueRef res : adaptor->inst_results(inst)) {
1006 // mark the value as used in the current block
1007 visit(res, block_idx);
1008 ++loops[block_loop_idx].definitions;
1009 }
1010
1011 for (const IRValueRef operand : adaptor->inst_operands(inst)) {
1012 visit(operand, block_idx);
1013 }
1014
1015 num_insts += 1;
1016 }
1017 }
1018
1019 // fill out the definitions_in_childs counters
1020 // (skip loop 0 since it is its own parent)
1021 for (u32 idx = loops.size() - 1; idx != 0; --idx) {
1022 auto &loop = loops[idx];
1023 loops[loop.parent].definitions_in_childs +=
1024 loop.definitions_in_childs + loop.definitions;
1025 }
1026
1027#ifdef TPDE_ASSERTS
1028 // reset the incorrect ref_counts in the liveness infos
1029 for (auto &entry : liveness) {
1030 if (entry.ref_count == ~0u) {
1031 entry.ref_count = 0;
1032 }
1033 }
1034#endif
1035
1036 TPDE_LOG_TRACE("Finished Liveness Analysis");
1037}
1038
1039} // namespace tpde