TPDE
Loading...
Searching...
No Matches
Analyzer.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include <algorithm>
7#include <format>
8#include <ostream>
9
10#include "IRAdaptor.hpp"
11#include "tpde/ValLocalIdx.hpp"
12#include "tpde/base.hpp"
13#include "util/SmallBitSet.hpp"
14#include "util/SmallVector.hpp"
15
16namespace tpde {
17
18template <IRAdaptor Adaptor>
19struct Analyzer {
20 // some forwards for the IR type defs
21 using IRValueRef = typename Adaptor::IRValueRef;
22 using IRInstRef = typename Adaptor::IRInstRef;
23 using IRBlockRef = typename Adaptor::IRBlockRef;
24 using IRFuncRef = typename Adaptor::IRFuncRef;
25
26 static constexpr IRBlockRef INVALID_BLOCK_REF = Adaptor::INVALID_BLOCK_REF;
27
28
29 static constexpr size_t SMALL_BLOCK_NUM = 64;
30 static constexpr size_t SMALL_VALUE_NUM = 128;
31
32 /// Reference to the adaptor
33 Adaptor *adaptor;
34
35 /// An index into block_layout
36 enum class BlockIndex : u32 {
37 };
38 static constexpr BlockIndex INVALID_BLOCK_IDX = static_cast<BlockIndex>(~0u);
39
40 /// The block layout, a BlockIndex is an index into this array
41 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> block_layout = {};
42
43 /// For each BlockIndex, the corresponding loop
44 // TODO(ts): add the delayed free list in here to save on allocations?
45 util::SmallVector<u32, SMALL_BLOCK_NUM> block_loop_map = {};
46
47 struct Loop {
48 u32 level;
49 u32 parent;
50 // [begin, end[
51 BlockIndex begin = INVALID_BLOCK_IDX, end = INVALID_BLOCK_IDX;
52
53 // for building the loop tree, we accumulate the number of blocks here
54 u32 num_blocks = 0;
55 // TODO(ts): add skip_target?
56
57 u32 definitions = 0, definitions_in_childs = 0;
58 };
59
60 util::SmallVector<Loop, 16> loops = {};
61
62 // TODO(ts): move all struct definitions to the top?
63 struct LivenessInfo {
64 // [first, last]
65 BlockIndex first, last;
66 u32 ref_count;
67 u32 lowest_common_loop;
68
69 // TODO(ts): maybe outsource both these booleans to a bitset?
70 // we're wasting a lot of space here
71
72 /// The value may not be deallocated until the last block is finished
73 /// even if the reference count hits 0
74 bool last_full;
75
76 u16 epoch = 0;
77 };
78
79 util::SmallVector<LivenessInfo, SMALL_VALUE_NUM> liveness = {};
80 /// Epoch of liveness information, entries with a value not equal to this
81 /// epoch are invalid. This is an optimization to avoid clearing the entire
82 /// liveness vector for every function, which is important for functions with
83 /// many values that are ignored for the liveness analysis (e.g., var refs).
84 u16 liveness_epoch = 0;
85 u32 liveness_max_value;
86
87 u32 num_insts;
88
89 explicit Analyzer(Adaptor *adaptor) : adaptor(adaptor) {}
90
91 /// Start the compilation of a new function and build the loop tree and
92 /// liveness information. Previous information is discarded.
93 void switch_func(IRFuncRef func);
94
95 IRBlockRef block_ref(const BlockIndex idx) const noexcept {
96 assert(static_cast<u32>(idx) <= block_layout.size());
97 if (static_cast<u32>(idx) == block_layout.size()) {
98 // this might be called with next_block() which is invalid for the
99 // last block
100 return INVALID_BLOCK_REF;
101 }
102 return block_layout[static_cast<u32>(idx)];
103 }
104
105 BlockIndex block_idx(IRBlockRef block_ref) const noexcept {
106 return static_cast<BlockIndex>(adaptor->block_info(block_ref));
107 }
108
109 const LivenessInfo &liveness_info(const ValLocalIdx val_idx) const noexcept {
110 assert(static_cast<u32>(val_idx) < liveness.size());
111 assert(liveness[static_cast<u32>(val_idx)].epoch == liveness_epoch &&
112 "access to liveness of ignored value");
113 return liveness[static_cast<u32>(val_idx)];
114 }
115
116 u32 block_loop_idx(const BlockIndex idx) const noexcept {
117 return block_loop_map[static_cast<u32>(idx)];
118 }
119
120 const Loop &loop_from_idx(const u32 idx) const noexcept { return loops[idx]; }
121
122 bool block_has_multiple_incoming(const BlockIndex idx) const noexcept {
123 return block_has_multiple_incoming(block_ref(idx));
124 }
125
126 bool block_has_multiple_incoming(IRBlockRef block_ref) const noexcept {
127 return (adaptor->block_info2(block_ref) & 0b11) == 2;
128 }
129
130 bool block_has_phis(BlockIndex idx) const noexcept {
131 return block_has_phis(block_ref(idx));
132 }
133
134 bool block_has_phis(IRBlockRef block_ref) const noexcept {
135 return (adaptor->block_info2(block_ref) & 0b1'0000) != 0;
136 }
137
138 void print_rpo(std::ostream &os) const;
139 void print_block_layout(std::ostream &os) const;
140 void print_loops(std::ostream &os) const;
141 void print_liveness(std::ostream &os) const;
142
143protected:
144 // for use during liveness analysis
145 LivenessInfo &liveness_maybe(const IRValueRef val) noexcept;
146
147 void build_block_layout();
148
149 void build_loop_tree_and_block_layout(
150 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
151 const util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
152 const util::SmallBitSet<256> &loop_heads);
153
154 /// Builds a vector of block references in reverse post-order.
155 void build_rpo_block_order(
156 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &out) const noexcept;
157
158 /// Creates a bitset of loop_heads and sets the parent loop of each block.
159 /// The u32 in loop_parent is an index into block_rpo
160 void identify_loops(
161 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
162 util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
163 util::SmallBitSet<256> &loop_heads) const noexcept;
164
165 void compute_liveness() noexcept;
166};
167
168template <IRAdaptor Adaptor>
169void Analyzer<Adaptor>::switch_func([[maybe_unused]] IRFuncRef func) {
170 build_block_layout();
171 compute_liveness();
172}
173
174template <IRAdaptor Adaptor>
175void Analyzer<Adaptor>::print_rpo(std::ostream &os) const {
176 // build_rpo_block_order clobbers block data, so save and restore.
177 util::SmallVector<std::tuple<IRBlockRef, u32, u32>, SMALL_BLOCK_NUM> data;
178 for (IRBlockRef cur : adaptor->cur_blocks()) {
179 data.emplace_back(cur, adaptor->block_info(cur), adaptor->block_info2(cur));
180 }
181
182 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> rpo;
183 build_rpo_block_order(rpo);
184 for (u32 i = 0; i < rpo.size(); ++i) {
185 os << std::format(" {}: {}\n", i, adaptor->block_fmt_ref(rpo[i]));
186 }
187
188 for (const auto &[cur, val1, val2] : data) {
189 adaptor->block_set_info(cur, val1);
190 adaptor->block_set_info2(cur, val2);
191 }
192}
193
194template <IRAdaptor Adaptor>
195void Analyzer<Adaptor>::print_block_layout(std::ostream &os) const {
196 for (u32 i = 0; i < block_layout.size(); ++i) {
197 os << std::format(" {}: {}\n", i, adaptor->block_fmt_ref(block_layout[i]));
198 }
199}
200
201template <IRAdaptor Adaptor>
202void Analyzer<Adaptor>::print_loops(std::ostream &os) const {
203 for (u32 i = 0; i < loops.size(); ++i) {
204 const auto &loop = loops[i];
205 os << std::format(" {}: level {}, parent {}, {}->{}\n",
206 i,
207 loop.level,
208 loop.parent,
209 static_cast<u32>(loop.begin),
210 static_cast<u32>(loop.end));
211 }
212}
213
214template <IRAdaptor Adaptor>
215void Analyzer<Adaptor>::print_liveness(std::ostream &os) const {
216 for (u32 i = 0; i <= liveness_max_value; ++i) {
217 if (liveness[i].epoch != liveness_epoch) {
218 os << std::format(" {}: ignored\n", i);
219 continue;
220 }
221
222 const auto &info = liveness[i];
223 os << std::format(" {}: {} refs, {}->{} ({}->{}), lf: {}\n",
224 i,
225 info.ref_count,
226 static_cast<u32>(info.first),
227 static_cast<u32>(info.last),
228 adaptor->block_fmt_ref(block_ref(info.first)),
229 adaptor->block_fmt_ref(block_ref(info.last)),
230 info.last_full);
231 }
232}
233
234template <IRAdaptor Adaptor>
235typename Analyzer<Adaptor>::LivenessInfo &
236 Analyzer<Adaptor>::liveness_maybe(const IRValueRef val) noexcept {
237 const ValLocalIdx val_idx = adaptor->val_local_idx(val);
238 if constexpr (Adaptor::TPDE_PROVIDES_HIGHEST_VAL_IDX) {
239 assert(liveness.size() > static_cast<u32>(val_idx));
240 return liveness[static_cast<u32>(val_idx)];
241 } else {
242 if (liveness_max_value <= static_cast<u32>(val_idx)) {
243 liveness_max_value = static_cast<u32>(val_idx);
244 if (liveness.size() <= liveness_max_value) {
245 // TODO: better growth strategy?
246 liveness.resize(liveness_max_value + 0x100);
247 }
248 }
249 return liveness[static_cast<u32>(val_idx)];
250 }
251}
252
253template <IRAdaptor Adaptor>
254void Analyzer<Adaptor>::build_block_layout() {
255 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> block_rpo{};
256 build_rpo_block_order(block_rpo);
257
258 util::SmallVector<u32, SMALL_BLOCK_NUM> loop_parent{};
259 util::SmallBitSet<256> loop_heads{};
260
261 // TODO(ts): print out this step?
262 identify_loops(block_rpo, loop_parent, loop_heads);
263 assert(loop_parent.size() == block_rpo.size());
264 // the entry block is always the loop head for the root loop
265 loop_heads.mark_set(0);
266
267 build_loop_tree_and_block_layout(block_rpo, loop_parent, loop_heads);
268}
269
270template <IRAdaptor Adaptor>
271void Analyzer<Adaptor>::build_loop_tree_and_block_layout(
272 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
273 const util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
274 const util::SmallBitSet<256> &loop_heads) {
275 // TODO(ts): maybe merge this into the block_rpo?
276 struct BlockLoopInfo {
277 u32 loop_idx;
278 u32 rpo_idx;
279 };
280
281 util::SmallVector<BlockLoopInfo, SMALL_BLOCK_NUM> loop_blocks;
282 loop_blocks.resize_uninitialized(block_rpo.size());
283 for (u32 i = 0; i < block_rpo.size(); ++i) {
284 loop_blocks[i] = BlockLoopInfo{~0u, i};
285 }
286
287 // if we have not seen the parent loop before, we need to recursively insert
288 // them.
289 // the recursive call only happens with irreducible control-flow
290 const auto build_or_get_parent_loop = [&](const u32 i,
291 const auto &self) -> u32 {
292 const auto parent = loop_parent[i];
293 if (loop_blocks[parent].loop_idx != ~0u) {
294 // we have already seen that block and given it a loop
295 return loop_blocks[parent].loop_idx;
296 } else {
297 // get the parent loop and build the loop for this block
298 const auto parent_loop_idx = self(parent, self);
299 const auto loop_idx = loops.size();
300 loops.push_back(Loop{.level = loops[parent_loop_idx].level + 1,
301 .parent = parent_loop_idx});
302 loop_blocks[parent].loop_idx = loop_idx;
303 return loop_idx;
304 }
305 };
306
307 // entry is always the head of the top-level loop
308 loops.clear();
309 loops.push_back(Loop{.level = 0, .parent = 0, .num_blocks = 1});
310 loop_blocks[0].loop_idx = 0;
311
312 for (u32 i = 1; i < loop_parent.size(); ++i) {
313 const u32 parent_loop =
314 build_or_get_parent_loop(i, build_or_get_parent_loop);
315
316 if (loop_heads.is_set(i)) {
317 // if the loop is irreducible, we might have already inserted it so
318 // check for that.
319 //
320 // NOTE: we could also get away with unsetting loop_heads for that
321 // loop if it is irreducible if we would not count the loop_head in
322 // its own loop's num_blocks but in its parent
323 auto loop_idx = loop_blocks[i].loop_idx;
324 if (loop_idx == ~0u) [[likely]] {
325 loop_idx = loops.size();
326 loops.push_back(
327 Loop{.level = loops[parent_loop].level + 1, .parent = parent_loop});
328 loop_blocks[i].loop_idx = loop_idx;
329 }
330 ++loops[loop_idx].num_blocks;
331 } else {
332 loop_blocks[i].loop_idx = parent_loop;
333 ++loops[parent_loop].num_blocks;
334 }
335 }
336
337 // acummulate the total number of blocks in a loop by iterating over them in
338 // reverse-order. this works since we always push the parents first
339 // leave out the first loop since it's its own parent
340 for (u32 i = loops.size() - 1; i > 0; --i) {
341 const auto &loop = loops[i];
342 loops[loop.parent].num_blocks += loop.num_blocks;
343 }
344
345 assert(loops[0].num_blocks == block_rpo.size());
346
347 // now layout the blocks by iterating in RPO and either place them at the
348 // current offset of the parent loop or, if they are a new loop, place the
349 // whole loop at the offset of the parent. this will ensure that blocks
350 // surrounded by loops will also be layouted that way in the final order.
351 //
352 // however, the way this is implemented causes the loop index to not
353 // correspond 1:1 with the final layout, though they will still be tightly
354 // packed and only the order inside a loop may change. note(ts): this could
355 // be mitigated with another pass i think.
356 //
357 // NB: we don't clear block_layout/block_loop_map, all entries are overwritten
358 block_layout.resize_uninitialized(block_rpo.size());
359 // TODO(ts): merge this with block_layout for less malloc calls?
360 // however, we will mostly need this in the liveness computation so it may
361 // be better for cache utilization to keep them separate
362 block_loop_map.resize_uninitialized(block_rpo.size());
363
364 loops[0].begin = loops[0].end = static_cast<BlockIndex>(0);
365
366 const auto layout_loop = [&](const u32 loop_idx, const auto &self) -> void {
367 assert(loops[loop_idx].begin == INVALID_BLOCK_IDX);
368 const auto parent = loops[loop_idx].parent;
369 if (loops[parent].begin == INVALID_BLOCK_IDX) {
370 // should only happen with irreducible control-flow
371 self(parent, self);
372 }
373
374 const auto loop_begin = loops[parent].end;
375 loops[parent].end = static_cast<BlockIndex>(static_cast<u32>(loop_begin) +
376 loops[loop_idx].num_blocks);
377 assert(static_cast<u32>(loops[parent].end) -
378 static_cast<u32>(loops[parent].begin) <=
379 loops[parent].num_blocks);
380
381 loops[loop_idx].begin = loops[loop_idx].end = loop_begin;
382 };
383
384 for (u32 i = 0u; i < block_rpo.size(); ++i) {
385 const auto loop_idx = loop_blocks[i].loop_idx;
386 if (loops[loop_idx].begin == INVALID_BLOCK_IDX) {
387 layout_loop(loop_idx, layout_loop);
388 }
389
390 const auto block_ref = block_rpo[loop_blocks[i].rpo_idx];
391 const auto block_idx = static_cast<u32>(loops[loop_idx].end);
392 loops[loop_idx].end = static_cast<BlockIndex>(block_idx + 1);
393
394 block_layout[block_idx] = block_ref;
395 block_loop_map[block_idx] = loop_idx;
396 adaptor->block_set_info(block_ref, block_idx);
397 }
398
399 assert(static_cast<u32>(loops[0].end) == block_rpo.size());
400}
401
402template <IRAdaptor Adaptor>
403void Analyzer<Adaptor>::build_rpo_block_order(
404 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &out) const noexcept {
405 out.clear();
406
407 u32 num_blocks = 0;
408 {
409 // Initialize the block info
410 u32 idx = 0;
411 for (IRBlockRef cur : adaptor->cur_blocks()) {
412 adaptor->block_set_info(cur, idx);
413 adaptor->block_set_info2(cur, 0);
414 ++idx;
415 }
416 num_blocks = idx;
417 }
418 out.resize_uninitialized(num_blocks);
419
420 // implement the RPO generation using a simple stack that also walks in
421 // post-order and then reverse at the end. However, consider the following
422 // CFG
423 //
424 // A:
425 // - B:
426 // - D
427 // - E
428 // - C:
429 // - F
430 // - G
431 //
432 // which has valid RPOs A B D E C F G, A B D E C G F, A B E D C F G, ...
433 // which is not very nice for loops since in many IRs the order of the block
434 // has some meaning for the layout so we sort the pushed children by the
435 // order in the block list
436 util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM / 2> stack;
437 stack.push_back(adaptor->cur_entry_block());
438
439 // NOTE(ts): because we process children in reverse order
440 // this gives a bit funky results with irreducible loops
441 // but it should be fine I think
442 auto rpo_idx = num_blocks - 1;
443 while (!stack.empty()) {
444 const auto cur_node = stack.back();
445
446 // have we already added the block to the RPO list?
447 // if so we just skip it
448 if (adaptor->block_info2(cur_node) & 0b1000) {
449 stack.pop_back();
450 continue;
451 }
452
453 // have we already pushed the children of the nodes to the stack and
454 // processed them? if so we can add the block to the post-order,
455 // otherwise add the children and wait for them to be processed
456 if (adaptor->block_info2(cur_node) & 0b100) {
457 stack.pop_back();
458 // set the blocks RPO index and push it to the RPO list
459 adaptor->block_set_info(cur_node, rpo_idx);
460 // mark the block as already being on the RPO list
461 adaptor->block_set_info2(cur_node,
462 adaptor->block_info2(cur_node) | 0b1000);
463 out[rpo_idx] = cur_node;
464 --rpo_idx;
465 continue;
466 }
467
468 // mark as visited and add the successors
469 adaptor->block_set_info2(cur_node, adaptor->block_info2(cur_node) | 0b100);
470
471 const auto start_idx = stack.size();
472 // push the successors onto the stack to visit them
473 for (const auto succ : adaptor->block_succs(cur_node)) {
474 assert(succ != adaptor->cur_entry_block());
475 const u32 info = adaptor->block_info2(succ);
476 if ((info & 0b11) != 0) {
477 if ((info & 0b11) == 1) {
478 // note that succ has more than one incoming edge
479 adaptor->block_set_info2(succ, (info & ~0b11) | 2);
480 }
481 } else {
482 // TODO(ts): if we just do a post-order traversal and reverse
483 // everything at the end we could use only block_info to check
484 // whether we already saw a block. And I forgot why?
485 adaptor->block_set_info2(succ, info | 1);
486 }
487
488 // if the successor is already on the rpo list or it has been
489 // already visited and children added
490 if (adaptor->block_info2(succ) & 0b1100) {
491 continue;
492 }
493
494 stack.push_back(succ);
495 }
496
497 // Order the pushed children by their original block
498 // index since the children get visited in reverse and then inserted
499 // in reverse order in the rpo list @_@
500 const auto len = stack.size() - start_idx;
501 if (len <= 1) {
502 continue;
503 }
504
505 if (len == 2) {
506 if (adaptor->block_info(stack[start_idx]) >
507 adaptor->block_info(stack[start_idx + 1])) {
508 std::swap(stack[start_idx], stack[start_idx + 1]);
509 }
510 continue;
511 }
512
513 std::sort(stack.begin() + start_idx,
514 stack.end(),
515 [this](const IRBlockRef lhs, const IRBlockRef rhs) {
516 // note(ts): this may have not so nice performance
517 // characteristics if the block lookup is a hashmap so
518 // maybe cache this for larger lists?
519 return adaptor->block_info(lhs) < adaptor->block_info(rhs);
520 });
521 }
522
523 if (rpo_idx != 0xFFFF'FFFF) {
524 // there are unreachable blocks
525 // so we did not fill up the whole array and need to shift it
526 // TODO(ts): benchmark this against filling up a vector and always
527 // reversing it tho it should be better i think
528 out.erase(out.begin(), out.begin() + 1 + rpo_idx);
529
530 // need to fixup the RPO index for blocks as well :/
531 for (auto i = 0u; i < out.size(); ++i) {
532 adaptor->block_set_info(out[i], i);
533 }
534
535#ifndef NDEBUG
536 // In debug builds, reset block index of unreachable blocks.
537 for (IRBlockRef cur : adaptor->cur_blocks()) {
538 if (adaptor->block_info2(cur) == 0) {
539 adaptor->block_set_info(cur, 0xFFFF'FFFF);
540 }
541 }
542#endif
543 }
544
545#ifdef TPDE_LOGGING
546 TPDE_LOG_TRACE("Finished building RPO for blocks:");
547 for (u32 i = 0; i < out.size(); ++i) {
548 TPDE_LOG_TRACE("Index {}: {}", i, adaptor->block_fmt_ref(out[i]));
549 }
550#endif
551}
552
553template <IRAdaptor Adaptor>
554void Analyzer<Adaptor>::identify_loops(
555 const util::SmallVector<IRBlockRef, SMALL_BLOCK_NUM> &block_rpo,
556 util::SmallVector<u32, SMALL_BLOCK_NUM> &loop_parent,
557 util::SmallBitSet<256> &loop_heads) const noexcept {
558 loop_parent.clear();
559 loop_parent.resize(block_rpo.size());
560 loop_heads.clear();
561 loop_heads.resize(block_rpo.size());
562
563 // Implement the modified algorithm from Wei et al.: A New Algorithm for
564 // Identifying Loops in Decompilation
565 // in a non-recursive form
566
567 // TODO(ts): ask the adaptor if it can store this information for us?
568 // then we could save the allocation here
569 struct BlockInfo {
570 bool traversed;
571 bool self_loop;
572 u32 dfsp_pos;
573 u32 iloop_header;
574 };
575
576 util::SmallVector<BlockInfo, SMALL_BLOCK_NUM> block_infos;
577 block_infos.resize(block_rpo.size());
578
579 struct TravState {
580 u32 block_idx; // b0 from the paper
581 u32 dfsp_pos;
582 u32 nh;
583 u8 state = 0;
584 decltype(adaptor->block_succs(adaptor->cur_entry_block()).begin()) succ_it;
585 decltype(adaptor->block_succs(adaptor->cur_entry_block()).end()) end_it;
586 };
587
588 // The algorithm will reach the depth of the CFG so the small vector needs
589 // to be relatively big
590 // TODO(ts); maybe use the recursive form for small CFGs since that may be
591 // faster or use stack switching since the non-recursive version is really
592 // ugly
593 util::SmallVector<TravState, SMALL_BLOCK_NUM> trav_state;
594
595 trav_state.push_back(
596 TravState{.block_idx = 0,
597 .dfsp_pos = 1,
598 .succ_it = adaptor->block_succs(block_rpo[0]).begin(),
599 .end_it = adaptor->block_succs(block_rpo[0]).end()});
600
601 const auto tag_lhead = [&block_infos](u32 b, u32 h) {
602 if (b == h || h == 0) {
603 return;
604 }
605
606 auto cur1 = b, cur2 = h;
607 while (block_infos[cur1].iloop_header != 0) {
608 const auto ih = block_infos[cur1].iloop_header;
609 if (ih == cur2) {
610 return;
611 }
612 if (block_infos[ih].dfsp_pos < block_infos[cur2].dfsp_pos) {
613 block_infos[cur1].iloop_header = cur2;
614 cur1 = cur2;
615 cur2 = ih;
616 } else {
617 cur1 = ih;
618 }
619 }
620 block_infos[cur1].iloop_header = cur2;
621 };
622
623 // TODO(ts): this can be optimized and more condensed so that the state
624 // variable becomes unnecessary
625 while (!trav_state.empty()) {
626 auto &state = trav_state.back();
627 const auto block_idx = state.block_idx;
628 switch (state.state) {
629 case 0: {
630 // entry
631 block_infos[block_idx].traversed = true;
632 block_infos[block_idx].dfsp_pos = state.dfsp_pos;
633
634 // TODO(ts): somehow make sure that the iterators can live when
635 // succs is destroyed?
636 // auto succs = adaptor->block_succs(block_rpo[block_idx]);
637 // state.succ_it = succs.begin();
638 // state.end_it = succs.end();
639 state.state = 1;
640 }
641 [[fallthrough]];
642 case 1: {
643 loop_inner:
644 auto cont = false;
645 while (state.succ_it != state.end_it) {
646 const auto succ_idx = adaptor->block_info(*state.succ_it);
647 if (succ_idx == block_idx) {
648 block_infos[block_idx].self_loop = true;
649 }
650
651 if (block_infos[succ_idx].traversed) {
652 if (block_infos[succ_idx].dfsp_pos > 0) {
653 tag_lhead(block_idx, succ_idx);
654 } else if (block_infos[succ_idx].iloop_header != 0) {
655 auto h_idx = block_infos[succ_idx].iloop_header;
656 if (block_infos[h_idx].dfsp_pos > 0) {
657 tag_lhead(block_idx, h_idx);
658 } else {
659 while (block_infos[h_idx].iloop_header != 0) {
660 h_idx = block_infos[h_idx].iloop_header;
661 if (block_infos[h_idx].dfsp_pos > 0) {
662 tag_lhead(block_idx, h_idx);
663 break;
664 }
665 }
666 }
667 }
668 ++state.succ_it;
669 continue;
670 }
671
672 // recurse
673 cont = true;
674 state.state = 2;
675 // TODO(ts): somehow make sure that the iterators can live when
676 // succs is destroyed?
677 auto succs = adaptor->block_succs(block_rpo[succ_idx]);
678 trav_state.push_back(TravState{.block_idx = succ_idx,
679 .dfsp_pos = state.dfsp_pos + 1,
680 .succ_it = succs.begin(),
681 .end_it = succs.end()});
682 break;
683 }
684
685 if (cont) {
686 continue;
687 }
688
689 block_infos[block_idx].dfsp_pos = 0;
690 trav_state.pop_back();
691 if (trav_state.empty()) {
692 break;
693 }
694 trav_state.back().nh = block_infos[block_idx].iloop_header;
695 continue;
696 }
697 case 2: {
698 const auto nh = state.nh;
699 tag_lhead(block_idx, nh);
700
701 ++state.succ_it;
702 state.state = 1;
703 goto loop_inner;
704 }
705 }
706 }
707
708 for (u32 i = 0; i < block_rpo.size(); ++i) {
709 auto &info = block_infos[i];
710 if (info.iloop_header != 0) {
711 loop_parent[i] = info.iloop_header;
712 loop_heads.mark_set(info.iloop_header);
713 }
714 if (info.self_loop) {
715 loop_heads.mark_set(i);
716 }
717 }
718}
719
720template <IRAdaptor Adaptor>
721void Analyzer<Adaptor>::compute_liveness() noexcept {
722 // implement the liveness algorithm described in
723 // http://databasearchitects.blogspot.com/2020/04/linear-time-liveness-analysis.html
724 // and Kohn et al.: Adaptive Execution of Compiled Queries
725 // TODO(ts): also expose the two-pass liveness algo as an option
726
727 TPDE_LOG_TRACE("Starting Liveness Analysis");
728
729 // Bump epoch. On overflow, we must clear all liveness info entries.
730 if (++liveness_epoch == 0) {
731 liveness.clear();
732 liveness_epoch = 1;
733 }
734
735 if constexpr (Adaptor::TPDE_PROVIDES_HIGHEST_VAL_IDX) {
736 liveness_max_value = adaptor->cur_highest_val_idx();
737 if (liveness_max_value >= liveness.size()) {
738 liveness.resize(liveness_max_value + 1);
739 }
740 } else {
741 liveness_max_value = 0;
742 }
743
744 num_insts = 0;
745
746 const auto visit = [this](const IRValueRef value, const u32 block_idx) {
747 TPDE_LOG_TRACE(" Visiting value {} in block {}",
748 adaptor->value_fmt_ref(value),
749 block_idx);
750 if (adaptor->val_ignore_in_liveness_analysis(value)) {
751 TPDE_LOG_TRACE(" value is ignored");
752 return;
753 }
754
755 auto &liveness = liveness_maybe(value);
756 if (liveness.epoch != liveness_epoch) {
757 TPDE_LOG_TRACE(" initializing liveness info, lcl is {}",
758 block_loop_map[block_idx]);
759 liveness = LivenessInfo{
760 .first = static_cast<BlockIndex>(block_idx),
761 .last = static_cast<BlockIndex>(block_idx),
762 .ref_count = 1,
763 .lowest_common_loop = block_loop_map[block_idx],
764 .last_full = false,
765 .epoch = liveness_epoch,
766 };
767 return;
768 }
769
770 assert(liveness.ref_count != ~0u && "used value without definition");
771
772 ++liveness.ref_count;
773 TPDE_LOG_TRACE(" increasing ref_count to {}", liveness.ref_count);
774
775 // helpers
776 const auto update_for_block_only =
777 [&liveness, block_idx](bool check_for_backwards_extension) {
778 const auto old_first = static_cast<u32>(liveness.first);
779 const auto old_last = static_cast<u32>(liveness.last);
780
781 const auto new_first = std::min(old_first, block_idx);
782 const auto new_last = std::max(old_last, block_idx);
783 liveness.first = static_cast<BlockIndex>(new_first);
784 liveness.last = static_cast<BlockIndex>(new_last);
785
786 // if last changed, we don't need to extend the lifetime to the end
787 // of the last block except if we extended the liveness interval
788 // to an outside loop which means we saw a use in a previous block
789 if (old_last == new_last && check_for_backwards_extension) {
790 liveness.last_full = true;
791 } else if (old_last != new_last) {
792 liveness.last_full = false;
793 }
794 };
795
796 const auto update_for_loop = [&liveness](
797 const Loop &loop,
798 bool check_for_backwards_extension) {
799 const auto old_first = static_cast<u32>(liveness.first);
800 const auto old_last = static_cast<u32>(liveness.last);
801
802 const auto new_first = std::min(old_first, static_cast<u32>(loop.begin));
803 const auto new_last = std::max(old_last, static_cast<u32>(loop.end) - 1);
804 liveness.first = static_cast<BlockIndex>(new_first);
805 liveness.last = static_cast<BlockIndex>(new_last);
806
807 // if last changed, set last_full to true
808 // since the values need to be allocated when the loop is active
809 // Otherwise, if the liveness interval was extended backwards we need to
810 // mark the liveness intervall as full as well
811 if (old_last != new_last || check_for_backwards_extension) {
812 liveness.last_full = true;
813 }
814 };
815
816 const auto block_loop_idx = block_loop_map[block_idx];
817 if (liveness.lowest_common_loop == block_loop_idx) {
818 // just extend the liveness interval
819 TPDE_LOG_TRACE(" lcl is same as block loop");
820 update_for_block_only(false);
821
822 TPDE_LOG_TRACE(" new interval {}->{}, lf: {}",
823 static_cast<u32>(liveness.first),
824 static_cast<u32>(liveness.last),
825 liveness.last_full);
826 return;
827 }
828
829 const Loop &liveness_loop = loops[liveness.lowest_common_loop];
830 const Loop &block_loop = loops[block_loop_idx];
831
832 if (liveness_loop.level < block_loop.level &&
833 static_cast<u32>(block_loop.begin) <
834 static_cast<u32>(liveness_loop.end)) {
835 assert(static_cast<u32>(block_loop.end) <=
836 static_cast<u32>(liveness_loop.end));
837
838 TPDE_LOG_TRACE(" block_loop {} is nested inside lcl", block_loop_idx);
839 // The current use is nested inside the loop of the liveness
840 // interval so we only need to get the loop at level
841 // (liveness_loop.level + 1) that contains block_loop and extend the
842 // liveness interval
843 const auto target_level = liveness_loop.level + 1;
844 auto cur_loop_idx = block_loop_idx;
845 auto cur_level = block_loop.level;
846 // TODO(ts): we could also skip some loops here since we know that
847 // there have to be at least n=(cur_level-target_level) loops
848 // between cur_loop and target_loop
849 // so we could choose cur_loop_idx = min(cur_parent, cur_idx-n)?
850 // however this might jump into more nested loops
851 // so maybe check (cur_idx-n).level first?
852 while (cur_level != target_level) {
853 cur_loop_idx = loops[cur_loop_idx].parent;
854 --cur_level;
855 }
856 assert(loops[cur_loop_idx].level == target_level);
857 TPDE_LOG_TRACE(" target_loop is {}", cur_loop_idx);
858 update_for_loop(loops[cur_loop_idx], false);
859
860 TPDE_LOG_TRACE(" new interval {}->{}, lf: {}",
861 static_cast<u32>(liveness.first),
862 static_cast<u32>(liveness.last),
863 liveness.last_full);
864 return;
865 }
866
867 TPDE_LOG_TRACE(" block_loop {} is nested higher or in a different loop",
868 block_loop_idx);
869 // need to update the lowest common loop to contain both liveness_loop
870 // and block_loop and then extend the interval accordingly
871
872 // TODO(ts): this algorithm is currently worst-case O(n) which makes the
873 // whole liveness analysis worst-case O(n^2) which is not good. However,
874 // we expect the number of loops to be much smaller than the number of
875 // values so it should be fine for most programs. To be safe, we should
876 // implement the algorithm from "Gusfield: Constant-Time Lowest Common
877 // Ancestor Retrieval" which seems to be the one with the most
878 // reasonable overhead. It is however not zero and the queries are also
879 // not exactly free so we should definitely benchmark both version first
880 // with both small and large programs
881
882 auto lhs_idx = liveness.lowest_common_loop;
883 auto rhs_idx = block_loop_idx;
884 auto prev_rhs = rhs_idx;
885 auto prev_lhs = lhs_idx;
886 while (lhs_idx != rhs_idx) {
887 const auto lhs_level = loops[lhs_idx].level;
888 const auto rhs_level = loops[rhs_idx].level;
889 if (lhs_level > rhs_level) {
890 prev_lhs = lhs_idx;
891 lhs_idx = loops[lhs_idx].parent;
892 } else if (lhs_level < rhs_level) {
893 prev_rhs = rhs_idx;
894 rhs_idx = loops[rhs_idx].parent;
895 } else {
896 prev_lhs = lhs_idx;
897 prev_rhs = rhs_idx;
898 lhs_idx = loops[lhs_idx].parent;
899 rhs_idx = loops[rhs_idx].parent;
900 }
901 }
902
903 assert(static_cast<u32>(loops[lhs_idx].begin) <=
904 static_cast<u32>(liveness_loop.begin));
905 assert(static_cast<u32>(loops[lhs_idx].end) >=
906 static_cast<u32>(liveness_loop.end));
907 TPDE_LOG_TRACE(" new lcl is {}", lhs_idx);
908
909 liveness.lowest_common_loop = lhs_idx;
910
911 // extend for the full loop that contains liveness_loop and is nested
912 // directly in lcl
913 assert(loops[prev_lhs].parent == lhs_idx);
914 assert(static_cast<u32>(loops[prev_lhs].begin) <=
915 static_cast<u32>(liveness_loop.begin));
916 assert(static_cast<u32>(loops[prev_lhs].end) >=
917 static_cast<u32>(liveness_loop.end));
918 update_for_loop(loops[prev_lhs], false);
919
920 // extend by block if the block_loop is the lcl
921 // or by prev_rhs (the loop containing block_loop nested directly in
922 // lcl) otherwise
923 if (lhs_idx == block_loop_idx) {
924 update_for_block_only(true);
925 } else {
926 assert(loops[prev_rhs].parent == lhs_idx);
927 assert(loops[prev_rhs].level == loops[lhs_idx].level + 1);
928 update_for_loop(loops[prev_rhs], true);
929 }
930
931 TPDE_LOG_TRACE(" new interval {}->{}, lf: {}",
932 static_cast<u32>(liveness.first),
933 static_cast<u32>(liveness.last),
934 liveness.last_full);
935 };
936
937 assert(block_layout[0] == adaptor->cur_entry_block());
938 if constexpr (Adaptor::TPDE_LIVENESS_VISIT_ARGS) {
939 for (const IRValueRef arg : adaptor->cur_args()) {
940 visit(arg, 0);
941 }
942 }
943
944 for (u32 block_idx = 0; block_idx < block_layout.size(); ++block_idx) {
945 IRBlockRef block = block_layout[block_idx];
946 TPDE_LOG_TRACE(
947 "Analyzing block {} ('{}')", block_idx, adaptor->block_fmt_ref(block));
948 const auto block_loop_idx = block_loop_map[block_idx];
949
950 bool has_phis = false;
951 for (const IRValueRef phi : adaptor->block_phis(block)) {
952 TPDE_LOG_TRACE("Analyzing phi {}", adaptor->value_fmt_ref(phi));
953 has_phis = true;
954
955 const auto phi_ref = adaptor->val_as_phi(phi);
956 const u32 slot_count = phi_ref.incoming_count();
957 for (u32 i = 0; i < slot_count; ++i) {
958 const IRBlockRef incoming_block = phi_ref.incoming_block_for_slot(i);
959 const IRValueRef incoming_value = phi_ref.incoming_val_for_slot(i);
960 if (adaptor->block_info2(incoming_block) == 0) {
961 TPDE_LOG_TRACE("ignoring phi input from unreachable pred ({})",
962 adaptor->block_fmt_ref(incoming_block));
963 continue;
964 }
965 const auto incoming_block_idx = adaptor->block_info(incoming_block);
966
967 // mark the incoming value as used in the incoming block
968 visit(incoming_value, incoming_block_idx);
969 // mark the PHI-value as used in the incoming block
970 visit(phi, incoming_block_idx);
971 }
972 }
973
974 if (has_phis) {
975 adaptor->block_set_info2(block, adaptor->block_info2(block) | 0b1'0000);
976 }
977
978 for (const IRInstRef inst : adaptor->block_insts(block)) {
979 TPDE_LOG_TRACE("Analyzing instruction {}", adaptor->inst_fmt_ref(inst));
980 for (const IRValueRef res : adaptor->inst_results(inst)) {
981 // mark the value as used in the current block
982 visit(res, block_idx);
983 ++loops[block_loop_idx].definitions;
984 }
985
986 for (const IRValueRef operand : adaptor->inst_operands(inst)) {
987 visit(operand, block_idx);
988 }
989
990 num_insts += 1;
991 }
992 }
993
994 // fill out the definitions_in_childs counters
995 // (skip 0 since it has itself as a parent)
996 for (u32 idx = loops.size() - 1; idx != 0; --idx) {
997 auto &loop = loops[idx];
998 loops[loop.parent].definitions_in_childs +=
999 loop.definitions_in_childs + loop.definitions;
1000 }
1001
1002#ifdef TPDE_ASSERTS
1003 // reset the incorrect ref_counts in the liveness infos
1004 for (auto &entry : liveness) {
1005 if (entry.ref_count == ~0u) {
1006 entry.ref_count = 0;
1007 }
1008 }
1009#endif
1010
1011 TPDE_LOG_TRACE("Finished Liveness Analysis");
1012}
1013
1014} // namespace tpde