TPDE
CompilerA64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "tpde/AssemblerElf.hpp"
7#include "tpde/AssignmentPartRef.hpp"
8#include "tpde/CompilerBase.hpp"
9#include "tpde/DWARF.hpp"
10#include "tpde/ELF.hpp"
11#include "tpde/arm64/FunctionWriterA64.hpp"
12#include "tpde/base.hpp"
13#include "tpde/util/SmallVector.hpp"
14#include "tpde/util/misc.hpp"
15
16#include <bit>
17#include <disarm64.h>
18
19// Helper macros for assembling in the compiler
20#if defined(ASM) || defined(ASMNC) || defined(ASMC)
21 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
22#endif
23
24/// Encode an instruction with an explicit compiler pointer
25#define ASMC(compiler, op, ...) \
26 ((compiler)->text_writer.write_inst(de64_##op(__VA_ARGS__)))
27/// Encode an instruction into this
28#define ASM(...) ASMC(this, __VA_ARGS__)
29/// Encode an instruction without checking that enough space is available
30#define ASMNC(op, ...) \
31 (this->text_writer.write_inst_unchecked(de64_##op(__VA_ARGS__)))
32/// Encode an instruction if the encoding is successful (returns true)
33#define ASMIFC(compiler, op, ...) \
34 ((compiler)->text_writer.try_write_inst(de64_##op(__VA_ARGS__)))
35/// Encode an instruction if the encoding is successful (returns true)
36#define ASMIF(...) ASMIFC(this, __VA_ARGS__)
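// Illustrative usage (a sketch, not part of the original header): ASM emits
// through `this`, ASMNC skips the buffer-space check, and ASMIF only emits
// when the operands/immediates are encodable, which enables fallback
// sequences such as the one used by load_address_of_stack_var below:
//
//   if (!ASMIF(ADDxi, dst, DA_GP(29), frame_off)) {
//     materialize_constant(frame_off, Config::GP_BANK, 4, dst);
//     ASM(ADDx_uxtw, dst, DA_GP(29), dst, 0);
//   }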
37
38namespace tpde::a64 {
39
40struct AsmReg : Reg {
41 enum REG : u8 {
42 R0 = 0,
43 R1,
44 R2,
45 R3,
46 R4,
47 R5,
48 R6,
49 R7,
50 R8,
51 R9,
52 R10,
53 R11,
54 R12,
55 R13,
56 R14,
57 R15,
58 R16,
59 R17,
60 R18,
61 R19,
62 R20,
63 R21,
64 R22,
65 R23,
66 R24,
67 R25,
68 R26,
69 R27,
70 R28,
71 R29,
72 FP = 29,
73 R30,
74 LR = 30,
75 SP = 31,
76
77 V0 = 32,
78 V1,
79 V2,
80 V3,
81 V4,
82 V5,
83 V6,
84 V7,
85 V8,
86 V9,
87 V10,
88 V11,
89 V12,
90 V13,
91 V14,
92 V15,
93 V16,
94 V17,
95 V18,
96 V19,
97 V20,
98 V21,
99 V22,
100 V23,
101 V24,
102 V25,
103 V26,
104 V27,
105 V28,
106 V29,
107 V30,
108 V31
109 };
110
111 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
112
113 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
114
115 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
116
117 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
118 assert(id <= SP || (id >= V0 && id <= V31));
119 }
120
121 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
122 assert(id <= SP || (id >= V0 && id <= V31));
123 }
124
125 operator DA_GReg() const noexcept {
126 assert(reg_id < V0);
127 return DA_GReg{reg_id};
128 }
129
130 operator DA_GRegZR() const noexcept {
131 assert(reg_id < V0);
132 assert(reg_id != SP); // 31 means SP in our enums
133 return DA_GRegZR{reg_id};
134 }
135
136 operator DA_GRegSP() const noexcept {
137 assert(reg_id <= SP);
138 return DA_GRegSP{reg_id};
139 }
140
141 operator DA_VReg() const noexcept {
142 assert(reg_id >= V0 && reg_id <= V31);
143 return DA_VReg{static_cast<u8>(reg_id - V0)};
144 }
145};
146
147constexpr static u64
148 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
149 u64 set = 0;
150 for (const auto reg : regs) {
151 set |= 1ull << reg;
152 }
153 return set;
154}
155
156template <size_t N>
157constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
158 u64 set = 0;
159 for (const auto reg : regs) {
160 set |= 1ull << reg.id();
161 }
162 return set;
163}
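// Example (illustrative): bit i of the returned mask corresponds to the
// register with id i, so
//   constexpr u64 mask = create_bitmask({AsmReg::R0, AsmReg::R1}); // == 0b11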
164
165/// AArch64 AAPCS calling convention.
166class CCAssignerAAPCS : public CCAssigner {
167 static constexpr CCInfo Info{
168 // we reserve SP, FP, R16 and R17 for our special use cases
169 .allocatable_regs =
170 0xFFFF'FFFF'FFFF'FFFF &
171 ~create_bitmask({AsmReg::SP, AsmReg::FP, AsmReg::R16, AsmReg::R17}),
172 // callee-saved registers
173 .callee_saved_regs = create_bitmask({
174 AsmReg::R19,
175 AsmReg::R20,
176 AsmReg::R21,
177 AsmReg::R22,
178 AsmReg::R23,
179 AsmReg::R24,
180 AsmReg::R25,
181 AsmReg::R26,
182 AsmReg::R27,
183 AsmReg::R28,
184 AsmReg::V8,
185 AsmReg::V9,
186 AsmReg::V10,
187 AsmReg::V11,
188 AsmReg::V12,
189 AsmReg::V13,
190 AsmReg::V14,
191 AsmReg::V15,
192 }),
193 .arg_regs = create_bitmask({
194 AsmReg::R0,
195 AsmReg::R1,
196 AsmReg::R2,
197 AsmReg::R3,
198 AsmReg::R4,
199 AsmReg::R5,
200 AsmReg::R6,
201 AsmReg::R7,
202 AsmReg::R8, // sret register
203 AsmReg::V0,
204 AsmReg::V1,
205 AsmReg::V2,
206 AsmReg::V3,
207 AsmReg::V4,
208 AsmReg::V5,
209 AsmReg::V6,
210 AsmReg::V7,
211 }),
212 };
213
214 // NGRN = Next General-purpose Register Number
215 // NSRN = Next SIMD/FP Register Number
216 // NSAA = Next Stack Argument Address
217 u32 ngrn = 0, nsrn = 0, nsaa = 0;
218 u32 ret_ngrn = 0, ret_nsrn = 0;
219
220public:
221 CCAssignerAAPCS() noexcept : CCAssigner(Info) {}
222
223 void reset() noexcept override {
224 ngrn = nsrn = nsaa = ret_ngrn = ret_nsrn = 0;
225 }
226
227 void assign_arg(CCAssignment &arg) noexcept override {
228 if (arg.byval) [[unlikely]] {
229 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
230 arg.stack_off = nsaa;
231 nsaa += arg.size;
232 return;
233 }
234
235 if (arg.sret) [[unlikely]] {
236 arg.reg = AsmReg{AsmReg::R8};
237 return;
238 }
239
240 if (arg.bank == RegBank{0}) {
241 if (arg.align > 8) {
242 ngrn = util::align_up(ngrn, 2);
243 }
244 if (ngrn + arg.consecutive < 8) {
245 arg.reg = Reg{AsmReg::R0 + ngrn};
246 ngrn += 1;
247 } else {
248 ngrn = 8;
249 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
250 arg.stack_off = nsaa;
251 nsaa += 8;
252 }
253 } else {
254 if (nsrn + arg.consecutive < 8) {
255 arg.reg = Reg{AsmReg::V0 + nsrn};
256 nsrn += 1;
257 } else {
258 nsrn = 8;
259 u32 size = util::align_up(arg.size, 8);
260 nsaa = util::align_up(nsaa, size);
261 arg.stack_off = nsaa;
262 nsaa += size;
263 }
264 }
265 }
266
267 u32 get_stack_size() noexcept override { return nsaa; }
268
269 void assign_ret(CCAssignment &arg) noexcept override {
270 assert(!arg.byval && !arg.sret);
271 if (arg.bank == RegBank{0}) {
272 if (arg.align > 8) {
273 ret_ngrn = util::align_up(ret_ngrn, 2);
274 }
275 if (ret_ngrn + arg.consecutive < 8) {
276 arg.reg = Reg{AsmReg::R0 + ret_ngrn};
277 ret_ngrn += 1;
278 } else {
279 assert(false);
280 }
281 } else {
282 if (ret_nsrn + arg.consecutive < 8) {
283 arg.reg = Reg{AsmReg::V0 + ret_nsrn};
284 ret_nsrn += 1;
285 } else {
286 assert(false);
287 }
288 }
289 }
290};
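// Worked example (illustrative) of the assignment logic above: for a call
// f(i64 a, double b, i64 c), assign_arg yields a -> R0 (ngrn=1), b -> V0
// (nsrn=1) and c -> R1 (ngrn=2). Once ngrn/nsrn reach 8, further arguments
// are placed on the stack at 8-byte-aligned offsets tracked by nsaa.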
291
292struct PlatformConfig : CompilerConfigDefault {
293 using Assembler = tpde::elf::AssemblerElfA64;
294 using AsmReg = tpde::a64::AsmReg;
295 using DefaultCCAssigner = CCAssignerAAPCS;
296 using FunctionWriter = FunctionWriterA64;
297
298 static constexpr RegBank GP_BANK{0};
299 static constexpr RegBank FP_BANK{1};
300 static constexpr bool FRAME_INDEXING_NEGATIVE = false;
301 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
302 static constexpr u32 NUM_BANKS = 2;
303};
304
305/// Compiler mixin for targeting AArch64.
306template <IRAdaptor Adaptor,
307 typename Derived,
308 template <typename, typename, typename> typename BaseTy =
309 CompilerBase,
310 typename Config = PlatformConfig>
311struct CompilerA64 : BaseTy<Adaptor, Derived, Config> {
312 using Base = BaseTy<Adaptor, Derived, Config>;
313
314 using IRValueRef = typename Base::IRValueRef;
315 using IRBlockRef = typename Base::IRBlockRef;
316 using IRFuncRef = typename Base::IRFuncRef;
317
318 using ScratchReg = typename Base::ScratchReg;
319 using ValuePartRef = typename Base::ValuePartRef;
320 using ValuePart = typename Base::ValuePart;
321 using GenericValuePart = typename Base::GenericValuePart;
322
323 using RegisterFile = typename Base::RegisterFile;
324
325 using CallArg = typename Base::CallArg;
326
327 using Base::derived;
328
329
330 // TODO(ts): make this dependent on the number of callee-saved regs of the
331 // current function or if there is a call in the function?
332 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
333 6};
334
335 enum CPU_FEATURES : u32 {
336 CPU_BASELINE = 0, // ARMV8.0
337 };
338
339 CPU_FEATURES cpu_feats = CPU_BASELINE;
340
341 // When handling function arguments, we need to prevent argument registers
342 // from being handed out as fixed registers
343 //
344 // Additionally, we prevent R0 and R1 from being fixed assignments to
345 // prevent issues with exception handling
346 u64 fixed_assignment_nonallocatable_mask =
347 create_bitmask({AsmReg::R0, AsmReg::R1});
348 u32 func_start_off = 0u, func_prologue_alloc = 0u;
349 /// Offset to the `add sp, sp, XXX` instruction that the argument handling
350 /// uses to access stack arguments if needed
351 u32 func_arg_stack_add_off = ~0u;
352 AsmReg func_arg_stack_add_reg = AsmReg::make_invalid();
353
354 /// Permanent scratch register, e.g. to materialize constants/offsets. This is
355 /// used by materialize_constant, load_from_stack, spill_reg.
356 AsmReg permanent_scratch_reg = AsmReg::R16;
357
358 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
359 u32 reg_save_frame_off = 0;
360 util::SmallVector<u32, 8> func_ret_offs = {};
361
362 /// Helper class for building call sequences.
363 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
364 u32 stack_adjust_off = 0;
365 u32 stack_size = 0;
366 u32 stack_sub = 0;
367
368 void set_stack_used() noexcept;
369
370 public:
371 /// Constructor.
372 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
373 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
374
375 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
376 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
377 void call_impl(std::variant<SymRef, ValuePart> &&) noexcept;
378 void reset_stack() noexcept;
379 };
380
381 // for now, always generate an object
382 explicit CompilerA64(Adaptor *adaptor,
383 const CPU_FEATURES cpu_features = CPU_BASELINE)
384 : Base{adaptor}, cpu_feats(cpu_features) {
385 static_assert(std::is_base_of_v<CompilerA64, Derived>);
386 }
387
388 void start_func(u32) noexcept {}
389
390 /// Begin prologue, prepare for assigning arguments.
391 void prologue_begin(CCAssigner *cc_assigner) noexcept;
392 /// Assign argument part. Returns the stack offset if the value should be
393 /// initialized as a stack variable.
394 std::optional<i32> prologue_assign_arg_part(ValuePart &&vp,
395 CCAssignment cca) noexcept;
396 /// Finish prologue.
397 void prologue_end(CCAssigner *cc_assigner) noexcept;
398
399 // note: this has to call assembler->end_func
400 void finish_func(u32 func_idx) noexcept;
401
402 void reset() noexcept;
403
404 // helpers
405
406 void gen_func_epilog() noexcept;
407
408 void
409 spill_reg(const AsmReg reg, const u32 frame_off, const u32 size) noexcept;
410
411 void load_from_stack(AsmReg dst,
412 i32 frame_off,
413 u32 size,
414 bool sign_extend = false) noexcept;
415
416 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
417
418 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
419
420 GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
421 assert(ap.stack_valid() && !ap.variable_ref());
422 return typename GenericValuePart::Expr(AsmReg::R29, ap.frame_off());
423 }
424
425 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
426
427 /// Dynamic alloca of a fixed-size region.
428 void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;
429
430 /// Dynamic alloca of a dynamically-sized region (elem_size * count bytes).
431 /// count must be a 64-bit value.
432 void alloca_dynamic(u64 elem_size,
433 ValuePart &&count,
434 u32 align,
435 ValuePart &res) noexcept;
436
437 /// Materialize constant into a register.
438 void materialize_constant(const u64 *data,
439 RegBank bank,
440 u32 size,
441 AsmReg dst) noexcept;
442 /// Materialize constant into a register.
443 void materialize_constant(u64 const_u64,
444 RegBank bank,
445 u32 size,
446 AsmReg dst) noexcept {
447 assert(size <= sizeof(const_u64));
448 materialize_constant(&const_u64, bank, size, dst);
449 }
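// Behaviour sketch (see the definition further below): for GP constants, a
// zero becomes a single MOVZw and other values are expanded by de64_MOVconst
// into a short MOVZ/MOVK sequence; FP/vector constants try FMOV/MOVI
// immediates first and otherwise go through a GP register or a literal in
// .rodata.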
450
451 AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;
452
453 /// Jump conditions.
454 struct Jump {
455 // TODO: naming consistency
456 enum Kind : uint8_t {
457 Jeq, ///< Equal (Z == 1)
458 Jne, ///< Not equal (Z == 0)
459 Jcs, ///< Carry set (C == 1)
460 Jhs = Jcs, ///< Unsigned higher or same (C == 1)
461 Jcc, ///< Carry clear (C == 0)
462 Jlo = Jcc, ///< Unsigned lower (C == 0)
463 Jmi, ///< Minus, negative (N == 1)
464 Jpl, ///< Plus, positive or zero (N == 0)
465 Jvs, ///< Overflow (V == 1)
466 Jvc, ///< No Overflow (V == 0)
467 Jhi, ///< Unsigned higher (C == 1 && Z == 0)
468 Jls, ///< Unsigned lower or same (!(C == 1 && Z == 0))
469 Jge, ///< Signed greater than or equal (N == V)
470 Jlt, ///< Signed less than (N != V)
471 Jgt, ///< Signed greater than (Z == 0 && N == V)
472 Jle, ///< Signed less than or equal (!(Z == 0 && N == V))
473 jmp, ///< Unconditional jump
474 Cbz, ///< Compare and branch if zero (Wn or Xn register)
475 Cbnz, ///< Compare and branch if not zero (Wn or Xn register)
476 Tbz, ///< Test single bit and branch if zero (Xn register)
477 Tbnz, ///< Test single bit and branch if not zero (Xn register)
478 };
479
480 Kind kind;
481 AsmReg cmp_reg;
482 bool cmp_is_32;
483 u8 test_bit;
484
485 /// Unconditional branch.
486 constexpr Jump() : kind(Kind::jmp) {}
487
488 /// Unconditional or conditional branch based on flags.
489 constexpr Jump(Kind kind) : kind(kind), cmp_is_32(false), test_bit(0) {
490 assert(kind != Cbz && kind != Cbnz && kind != Tbz && kind != Tbnz);
491 }
492
493 /// Cbz/Cbnz branch.
494 constexpr Jump(Kind kind, AsmReg cmp_reg, bool cmp_is_32)
495 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(cmp_is_32), test_bit(0) {
496 assert(kind == Cbz || kind == Cbnz);
497 }
498
499 /// Tbz/Tbnz branch.
500 constexpr Jump(Kind kind, AsmReg cmp_reg, u8 test_bit)
501 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(false), test_bit(test_bit) {
502 assert(kind == Tbz || kind == Tbnz);
503 }
504
505 constexpr Jump change_kind(Kind new_kind) const {
506 auto cpy = *this;
507 cpy.kind = new_kind;
508 return cpy;
509 }
510 };
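// Construction examples (illustrative), matching the constructors above:
//   Jump{};                                    // unconditional branch
//   Jump{Jump::Jne};                           // conditional branch on flags
//   Jump{Jump::Cbz, reg, /*cmp_is_32=*/true};  // cbz wN, <label>
//   Jump{Jump::Tbnz, reg, u8{63}};             // tbnz xN, #63, <label>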
511
512 Jump invert_jump(Jump jmp) noexcept;
513 Jump swap_jump(Jump jmp) noexcept;
514
515 /// Generate jump instruction to target label.
516 void generate_raw_jump(Jump jmp, Label target) noexcept;
517
518 /// Convert jump condition to disarm's Da64Cond.
519 /// \warning Cbz, Cbnz, Tbz and Tbnz are not supported
520 Da64Cond jump_to_cond(Jump jmp) noexcept;
521 /// Set dst to 1 if cc is true, otherwise set it to zero
522 void generate_raw_set(Jump cc, AsmReg dst) noexcept;
523 /// Set all bits of dst to 1 if cc is true, otherwise set dst to zero
524 void generate_raw_mask(Jump cc, AsmReg dst) noexcept;
525
526 /// Move true_select into dst if cc is true,
527 /// otherwise move false_select into dst.
528 void generate_raw_select(Jump cc,
529 AsmReg dst,
530 AsmReg true_select,
531 AsmReg false_select,
532 bool is_64) noexcept;
533
534 /// Integer extension. src is not modified.
535 void generate_raw_intext(
536 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
537
538 /// Bitfield insert. src is not modified.
539 void generate_raw_bfi(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
540 ASM(BFIx, dst, src, lsb, width);
541 }
542 /// Bitfield insert in zero. src is not modified.
543 void generate_raw_bfiz(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
544 ASM(UBFIZx, dst, src, lsb, width);
545 }
546
547 /// Generate a function call
548 ///
549 /// This will get the arguments into the correct registers according to the
550 /// calling convention, clear non-callee-saved registers from the register
551 /// file (make sure you do not have any fixed assignments left over) and
552 /// fill the result registers (the u8 in the ScratchReg pair indicates the
553 /// register bank)
554 ///
555 /// Targets can be a symbol (call to PLT with relocation), or an indirect
556 /// call to a ValuePart. Result is an optional reference.
557 void generate_call(std::variant<SymRef, ValuePart> &&target,
558 std::span<CallArg> arguments,
559 typename Base::ValueRef *result,
560 bool variable_args = false);
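// Illustrative call sites (a sketch; argument and result setup are assumed
// to be done by the derived compiler):
//   derived()->generate_call(sym, args, &res_ref);                  // direct call via PLT
//   derived()->generate_call(std::move(target_vp), args, &res_ref); // indirect BLR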
561
562private:
563 /// @internal Emit compare of cmp_reg with case_value.
564 void switch_emit_cmp(AsmReg cmp_reg,
565 AsmReg tmp_reg,
566 u64 case_value,
567 bool width_is_32) noexcept;
568
569public:
570 /// @internal Jump if cmp_reg equals case_value.
571 void switch_emit_cmpeq(Label case_label,
572 AsmReg cmp_reg,
573 AsmReg tmp_reg,
574 u64 case_value,
575 bool width_is_32) noexcept;
576 /// @internal Emit bounds check and jump table.
577 bool switch_emit_jump_table(Label default_label,
578 std::span<const Label> labels,
579 AsmReg cmp_reg,
580 AsmReg tmp_reg,
581 u64 low_bound,
582 u64 high_bound,
583 bool width_is_32) noexcept;
584 /// @internal Jump if cmp_reg is greater than case_value.
585 void switch_emit_binary_step(Label case_label,
586 Label gt_label,
587 AsmReg cmp_reg,
588 AsmReg tmp_reg,
589 u64 case_value,
590 bool width_is_32) noexcept;
591
592 /// Generate code sequence to load address of sym into a register. This will
593 /// generate a function call for dynamic TLS access models.
594 ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;
595
596 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
597 return ((cpu_feats & feats) == feats);
598 }
599};
600
601template <IRAdaptor Adaptor,
602 typename Derived,
603 template <typename, typename, typename> class BaseTy,
604 typename Config>
605void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
606 set_stack_used() noexcept {
607 if (stack_adjust_off == 0) {
608 this->compiler.text_writer.ensure_space(16);
609 stack_adjust_off = this->compiler.text_writer.offset();
610 this->compiler.text_writer.cur_ptr() += 4;
611 }
612}
613
614template <IRAdaptor Adaptor,
615 typename Derived,
616 template <typename, typename, typename> class BaseTy,
617 typename Config>
618void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
619 ValuePart &vp, CCAssignment &cca) noexcept {
620 AsmReg ptr_reg = vp.load_to_reg(&this->compiler);
621 AsmReg tmp_reg = AsmReg::R16;
622
623 auto size = cca.size;
624 set_stack_used();
625 for (u32 off = 0; off < size;) {
626 if (size - off >= 8) {
627 ASMC(&this->compiler, LDRxu, tmp_reg, ptr_reg, off);
628 ASMC(&this->compiler, STRxu, tmp_reg, DA_SP, cca.stack_off + off);
629 off += 8;
630 } else if (size - off >= 4) {
631 ASMC(&this->compiler, LDRwu, tmp_reg, ptr_reg, off);
632 ASMC(&this->compiler, STRwu, tmp_reg, DA_SP, cca.stack_off + off);
633 off += 4;
634 } else if (size - off >= 2) {
635 ASMC(&this->compiler, LDRHu, tmp_reg, ptr_reg, off);
636 ASMC(&this->compiler, STRHu, tmp_reg, DA_SP, cca.stack_off + off);
637 off += 2;
638 } else {
639 ASMC(&this->compiler, LDRBu, tmp_reg, ptr_reg, off);
640 ASMC(&this->compiler, STRBu, tmp_reg, DA_SP, cca.stack_off + off);
641 off += 1;
642 }
643 }
644}
645
646template <IRAdaptor Adaptor,
647 typename Derived,
648 template <typename, typename, typename> class BaseTy,
649 typename Config>
650void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
651 ValuePart &vp, CCAssignment &cca) noexcept {
652 set_stack_used();
653
654 auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
655 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
656 switch (cca.size) {
657 case 1: ASMC(&this->compiler, STRBu, reg, DA_SP, cca.stack_off); break;
658 case 2: ASMC(&this->compiler, STRHu, reg, DA_SP, cca.stack_off); break;
659 case 4: ASMC(&this->compiler, STRwu, reg, DA_SP, cca.stack_off); break;
660 case 8: ASMC(&this->compiler, STRxu, reg, DA_SP, cca.stack_off); break;
661 default: TPDE_UNREACHABLE("invalid GP reg size");
662 }
663 } else {
664 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
665 switch (cca.size) {
666 case 1: ASMC(&this->compiler, STRbu, reg, DA_SP, cca.stack_off); break;
667 case 2: ASMC(&this->compiler, STRhu, reg, DA_SP, cca.stack_off); break;
668 case 4: ASMC(&this->compiler, STRsu, reg, DA_SP, cca.stack_off); break;
669 case 8: ASMC(&this->compiler, STRdu, reg, DA_SP, cca.stack_off); break;
670 case 16: ASMC(&this->compiler, STRqu, reg, DA_SP, cca.stack_off); break;
671 default: TPDE_UNREACHABLE("invalid FP reg size");
672 }
673 }
674}
675
676template <IRAdaptor Adaptor,
677 typename Derived,
678 template <typename, typename, typename> class BaseTy,
679 typename Config>
680void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
681 std::variant<SymRef, ValuePart> &&target) noexcept {
682 u32 sub = 0;
683 if (stack_adjust_off != 0) {
684 auto *text_data = this->compiler.text_writer.begin_ptr();
685 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + stack_adjust_off);
686 u32 stack_size = this->assigner.get_stack_size();
687 sub = util::align_up(stack_size, stack_size < 0x1000 ? 0x10 : 0x1000);
688 *write_ptr = de64_SUBxi(DA_SP, DA_SP, sub);
689 } else {
690 assert(this->assigner.get_stack_size() == 0);
691 }
692
693 // For vector registers, only the lowest half is callee-saved. Evict all
694 // value parts larger than 8 bytes.
695 auto fp_regs = RegisterFile::bank_regs(Config::FP_BANK);
696 auto fp_csrs = fp_regs & this->assigner.get_ccinfo().callee_saved_regs;
697 auto used_fp_csrs = fp_csrs & this->compiler.register_file.used;
698 for (auto reg_id : util::BitSetIterator<>{used_fp_csrs}) {
699 Reg reg{reg_id};
700 ValLocalIdx local_idx = this->compiler.register_file.reg_local_idx(reg);
701 auto part = this->compiler.register_file.reg_part(reg);
702 AssignmentPartRef ap{this->compiler.val_assignment(local_idx), part};
703 if (ap.part_size() > 8) {
704 this->compiler.evict(ap);
705 }
706 }
707
708 if (auto *sym = std::get_if<SymRef>(&target)) {
709 ASMC(&this->compiler, BL, 0);
710 this->compiler.reloc_text(
711 *sym, elf::R_AARCH64_CALL26, this->compiler.text_writer.offset() - 4);
712 } else {
713 ValuePart &tvp = std::get<ValuePart>(target);
714 if (tvp.can_salvage()) {
715 ASMC(&this->compiler, BLR, tvp.salvage(&this->compiler));
716 } else {
717 AsmReg reg = this->compiler.permanent_scratch_reg;
718 tvp.reload_into_specific_fixed(&this->compiler, reg);
719 ASMC(&this->compiler, BLR, reg);
720 }
721 tvp.reset(&this->compiler);
722 }
723
724 if (stack_adjust_off != 0) {
725 ASMC(&this->compiler, ADDxi, DA_SP, DA_SP, sub);
726 }
727}
728
729template <IRAdaptor Adaptor,
730 typename Derived,
731 template <typename, typename, typename> typename BaseTy,
732 typename Config>
733void CompilerA64<Adaptor, Derived, BaseTy, Config>::prologue_begin(
734 CCAssigner *cc_assigner) noexcept {
735 func_ret_offs.clear();
736 func_start_off = this->text_writer.offset();
737
738 const CCInfo &cc_info = cc_assigner->get_ccinfo();
739
740 // We don't actually generate the prologue here and merely allocate space
741 // for it. Right now, we don't know which callee-saved registers will be
742 // used. While we could pad with nops, we later move the beginning of the
743 // function so that small functions don't have to execute 9 nops.
744 // See finish_func.
745 this->stack.frame_size = 16; // FP, LR
746 {
747 auto csr = cc_info.callee_saved_regs;
748 auto csr_gp = csr & this->register_file.bank_regs(Config::GP_BANK);
749 auto csr_fp = csr & this->register_file.bank_regs(Config::FP_BANK);
750 u32 gp_saves = std::popcount(csr_gp);
751 u32 fp_saves = std::popcount(csr_fp);
752 // LDP/STP can handle two registers of the same bank.
753 u32 reg_save_size = 4 * ((gp_saves + 1) / 2 + (fp_saves + 1) / 2);
754 // TODO: support CSR of Qx/Vx registers, not just Dx
755 this->stack.frame_size += util::align_up(gp_saves * 8 + fp_saves * 8, 16);
756
757 // Reserve space for sub sp, stp x29/x30, and mov x29, sp.
758 func_prologue_alloc = reg_save_size + 12;
759 this->text_writer.ensure_space(func_prologue_alloc);
760 this->text_writer.cur_ptr() += func_prologue_alloc;
761 }
762
763 // TODO(ts): support larger stack alignments?
764
765 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
766 this->stack.frame_used = true;
767 reg_save_frame_off = this->stack.frame_size;
768 // We additionally store a pointer to the stack area, which we can't compute
769 // with a constant offset from the frame pointer. Add 16 bytes to maintain
770 // alignment.
771 this->stack.frame_size += 8 * 8 + 8 * 16 + 16;
772 this->text_writer.ensure_space(4 * 8);
773 ASMNC(STPx, DA_GP(0), DA_GP(1), DA_SP, reg_save_frame_off);
774 ASMNC(STPx, DA_GP(2), DA_GP(3), DA_SP, reg_save_frame_off + 16);
775 ASMNC(STPx, DA_GP(4), DA_GP(5), DA_SP, reg_save_frame_off + 32);
776 ASMNC(STPx, DA_GP(6), DA_GP(7), DA_SP, reg_save_frame_off + 48);
777 ASMNC(STPq, DA_V(0), DA_V(1), DA_SP, reg_save_frame_off + 64);
778 ASMNC(STPq, DA_V(2), DA_V(3), DA_SP, reg_save_frame_off + 96);
779 ASMNC(STPq, DA_V(4), DA_V(5), DA_SP, reg_save_frame_off + 128);
780 ASMNC(STPq, DA_V(6), DA_V(7), DA_SP, reg_save_frame_off + 160);
781 }
782
783 this->func_arg_stack_add_off = ~0u;
784}
785
786template <IRAdaptor Adaptor,
787 typename Derived,
788 template <typename, typename, typename> typename BaseTy,
789 typename Config>
790std::optional<i32>
791 CompilerA64<Adaptor, Derived, BaseTy, Config>::prologue_assign_arg_part(
792 ValuePart &&vp, CCAssignment cca) noexcept {
793 if (cca.reg.valid()) [[likely]] {
794 vp.set_value_reg(this, cca.reg);
795 // Mark register as allocatable as soon as it is assigned. If the argument
796 // is unused, the register will be freed immediately and can be used for
797 // later stack arguments.
798 this->register_file.allocatable |= u64{1} << cca.reg.id();
799 return {};
800 }
801
802 AsmReg dst = vp.alloc_reg(this);
803
804 this->text_writer.ensure_space(8);
805 AsmReg stack_reg = AsmReg::R17;
806 // TODO: allocate an actual scratch register for this.
807 assert(!(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
808 "x17 must not be allocatable");
809 if (this->func_arg_stack_add_off == ~0u) {
810 this->func_arg_stack_add_off = this->text_writer.offset();
811 this->func_arg_stack_add_reg = stack_reg;
812 // Fixed in finish_func when frame size is known
813 ASMNC(ADDxi, stack_reg, DA_SP, 0);
814 }
815
816 if (cca.byval) {
817 ASMNC(ADDxi, dst, stack_reg, cca.stack_off);
818 } else if (cca.bank == Config::GP_BANK) {
819 switch (cca.size) {
820 case 1: ASMNC(LDRBu, dst, stack_reg, cca.stack_off); break;
821 case 2: ASMNC(LDRHu, dst, stack_reg, cca.stack_off); break;
822 case 4: ASMNC(LDRwu, dst, stack_reg, cca.stack_off); break;
823 case 8: ASMNC(LDRxu, dst, stack_reg, cca.stack_off); break;
824 default: TPDE_UNREACHABLE("invalid GP reg size");
825 }
826 } else {
827 assert(cca.bank == Config::FP_BANK);
828 switch (cca.size) {
829 case 1: ASMNC(LDRbu, dst, stack_reg, cca.stack_off); break;
830 case 2: ASMNC(LDRhu, dst, stack_reg, cca.stack_off); break;
831 case 4: ASMNC(LDRsu, dst, stack_reg, cca.stack_off); break;
832 case 8: ASMNC(LDRdu, dst, stack_reg, cca.stack_off); break;
833 case 16: ASMNC(LDRqu, dst, stack_reg, cca.stack_off); break;
834 default: TPDE_UNREACHABLE("invalid FP reg size");
835 }
836 }
837 return {};
838}
839
840template <IRAdaptor Adaptor,
841 typename Derived,
842 template <typename, typename, typename> typename BaseTy,
843 typename Config>
844void CompilerA64<Adaptor, Derived, BaseTy, Config>::prologue_end(
845 CCAssigner *cc_assigner) noexcept {
846 // Hack: we don't know the frame size, so for a va_start(), we cannot easily
847 // compute the offset from the frame pointer. But we have a stack_reg here,
848 // so use it for var args.
849 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
850 this->stack.frame_used = true;
851 AsmReg stack_reg = AsmReg::R17;
852 // TODO: allocate an actual scratch register for this.
853 assert(!(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
854 "x17 must not be allocatable");
855 if (this->func_arg_stack_add_off == ~0u) {
856 this->func_arg_stack_add_off = this->text_writer.offset();
857 this->func_arg_stack_add_reg = stack_reg;
858 // Fixed in finish_func when frame size is known
859 ASMC(this, ADDxi, stack_reg, DA_SP, 0);
860 }
861 ASM(ADDxi, stack_reg, stack_reg, cc_assigner->get_stack_size());
862 ASM(STRxu, stack_reg, DA_GP(29), this->reg_save_frame_off + 192);
863
864 // TODO: extract ngrn/nsrn from CCAssigner
865 // TODO: this isn't quite accurate, e.g. for (i128, i128, i128, i64, i128),
866 // this should be 8 but will end up with 7.
867 const CCInfo &cc_info = cc_assigner->get_ccinfo();
868 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
869 u32 ngrn = 8 - util::cnt_lz<u16>((arg_regs & 0xff) << 8 | 0x80);
870 u32 nsrn = 8 - util::cnt_lz<u16>(((arg_regs >> 32) & 0xff) << 8 | 0x80);
871 this->scalar_arg_count = ngrn;
872 this->vec_arg_count = nsrn;
873 }
874}
875
876template <IRAdaptor Adaptor,
877 typename Derived,
878 template <typename, typename, typename> typename BaseTy,
879 typename Config>
880void CompilerA64<Adaptor, Derived, BaseTy, Config>::finish_func(
881 u32 func_idx) noexcept {
882 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
883 u64 saved_regs = this->register_file.clobbered & csr;
884
885 auto stack_reg = DA_SP;
886 if (this->stack.has_dynamic_alloca) {
887 stack_reg = DA_GP(29);
888 }
889
890 auto final_frame_size = util::align_up(this->stack.frame_size, 16);
891 if (final_frame_size > 4095) {
892 // round up to 4k since SUB cannot encode immediates greater than 4095
893 final_frame_size = util::align_up(final_frame_size, 4096);
894 assert(final_frame_size < 16 * 1024 * 1024);
895 }
896
897 bool needs_stack_frame =
898 this->stack.frame_used || this->stack.generated_call ||
899 this->stack.has_dynamic_alloca || saved_regs != 0 ||
900 (this->register_file.clobbered & (u64{1} << AsmReg::LR));
901
902 this->text_writer.eh_begin_fde(this->get_personality_sym());
903
904 u32 prologue_size = 0;
905 if (needs_stack_frame) [[likely]] {
906 // NB: code alignment factor 4, data alignment factor -8.
907 util::SmallVector<u32, 16> prologue;
908 // For small stack frames, remember the state at the very beginning, which
909 // is identical to the state after the post-increment LDP. For large stack
910 // frames, remember the state after the SP adjustment (encoding the
911 // corresponding DW_def_cfa SP, framesize would be >=3 bytes; this way we
912 // can get away with a DW_def_cfa_offset 0 after the ADD).
913 if (!func_ret_offs.empty() && final_frame_size <= 0x1f8) {
914 this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
915 }
916 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
917 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset,
918 final_frame_size);
919 if (final_frame_size <= 0x1f8) {
920 prologue.push_back(
921 de64_STPx_pre(DA_GP(29), DA_GP(30), DA_SP, -int(final_frame_size)));
922 prologue.push_back(de64_MOV_SPx(DA_GP(29), DA_SP));
923 } else {
924 if (!func_ret_offs.empty()) {
925 this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
926 }
927 prologue.push_back(de64_SUBxi(DA_SP, DA_SP, final_frame_size));
928 prologue.push_back(de64_STPx(DA_GP(29), DA_GP(30), DA_SP, 0));
929 prologue.push_back(de64_MOV_SPx(DA_GP(29), DA_SP));
930 }
931
932 // Patched below
933 auto fde_prologue_adv_off = this->text_writer.eh_writer.size();
934 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
935 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
936 dwarf::a64::DW_reg_fp);
937 this->text_writer.eh_write_inst(
938 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_fp, final_frame_size / 8);
939 this->text_writer.eh_write_inst(
940 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_lr, final_frame_size / 8 - 1);
941
942 AsmReg last_reg = AsmReg::make_invalid();
943 u32 frame_off = 16;
944 for (auto reg : util::BitSetIterator{saved_regs}) {
945 u8 dwarf_base = reg < 32 ? dwarf::a64::DW_reg_x0 : dwarf::a64::DW_reg_v0;
946 u8 dwarf_reg = dwarf_base + reg % 32;
947 u32 cfa_off = (final_frame_size - frame_off) / 8 - last_reg.valid();
948 if ((dwarf_reg & dwarf::DWARF_CFI_PRIMARY_OPCODE_MASK) == 0) {
949 this->text_writer.eh_write_inst(
950 dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
951 } else {
952 this->text_writer.eh_write_inst(
953 dwarf::DW_CFA_offset_extended, dwarf_reg, cfa_off);
954 }
955
956 if (last_reg.valid()) {
957 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
958 const auto last_bank = this->register_file.reg_bank(last_reg);
959 if (reg_bank == last_bank) {
960 if (reg_bank == Config::GP_BANK) {
961 prologue.push_back(
962 de64_STPx(last_reg, AsmReg{reg}, stack_reg, frame_off));
963 } else {
964 prologue.push_back(
965 de64_STPd(last_reg, AsmReg{reg}, stack_reg, frame_off));
966 }
967 frame_off += 16;
968 last_reg = AsmReg::make_invalid();
969 } else {
970 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
971 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
972 frame_off += 8;
973 last_reg = AsmReg{reg};
974 }
975 } else {
976 last_reg = AsmReg{reg};
977 }
978 }
979
980 if (last_reg.valid()) {
981 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
982 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
983 } else {
984 assert(this->register_file.reg_bank(last_reg) == Config::FP_BANK);
985 prologue.push_back(de64_STRdu(last_reg, stack_reg, frame_off));
986 }
987 }
988
989 assert(prologue.size() * sizeof(u32) <= func_prologue_alloc);
990
991 assert(prologue.size() < 0x4c);
992 this->text_writer.eh_writer.data()[fde_prologue_adv_off] =
993 dwarf::DW_CFA_advance_loc | (prologue.size() - 1);
994
995 std::memcpy(this->text_writer.begin_ptr() + func_start_off,
996 prologue.data(),
997 prologue.size() * sizeof(u32));
998
999 prologue_size = prologue.size() * sizeof(u32);
1000 }
1001
1002 if (func_arg_stack_add_off != ~0u) {
1003 auto *raw_inst_ptr = this->text_writer.begin_ptr() + func_arg_stack_add_off;
1004 u32 *inst_ptr = reinterpret_cast<u32 *>(raw_inst_ptr);
1005 if (needs_stack_frame) {
1006 *inst_ptr = de64_ADDxi(func_arg_stack_add_reg, DA_SP, final_frame_size);
1007 } else {
1008 *inst_ptr = de64_MOV_SPx(func_arg_stack_add_reg, DA_SP);
1009 }
1010 }
1011
1012 if (!func_ret_offs.empty()) {
1013 u8 *text_data = this->text_writer.begin_ptr();
1014 if (func_ret_offs.back() == this->text_writer.offset() - 4) {
1015 this->text_writer.cur_ptr() -= 4;
1016 func_ret_offs.pop_back();
1017 }
1018 for (auto ret_off : func_ret_offs) {
1019 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + ret_off);
1020 *write_ptr = de64_B((this->text_writer.offset() - ret_off) / 4);
1021 }
1022
1023 // Epilogue mirrors prologue + RET
1024 this->text_writer.ensure_space(prologue_size + 4);
1025
1026 if (this->stack.has_dynamic_alloca) {
1027 ASMNC(MOV_SPx, DA_SP, DA_GP(29));
1028 }
1029
1030 AsmReg last_reg = AsmReg::make_invalid();
1031 u32 frame_off = 16;
1032 for (auto reg : util::BitSetIterator{saved_regs}) {
1033 if (last_reg.valid()) {
1034 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
1035 const auto last_bank = this->register_file.reg_bank(last_reg);
1036 if (reg_bank == last_bank) {
1037 if (reg_bank == Config::GP_BANK) {
1038 ASMNC(LDPx, last_reg, AsmReg{reg}, stack_reg, frame_off);
1039 } else {
1040 ASMNC(LDPd, last_reg, AsmReg{reg}, stack_reg, frame_off);
1041 }
1042 frame_off += 16;
1043 last_reg = AsmReg::make_invalid();
1044 } else {
1045 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
1046 ASMNC(LDRxu, last_reg, stack_reg, frame_off);
1047 frame_off += 8;
1048 last_reg = AsmReg{reg};
1049 }
1050 continue;
1051 }
1052
1053 last_reg = AsmReg{reg};
1054 }
1055
1056 if (last_reg.valid()) {
1057 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
1058 ASMNC(LDRxu, last_reg, stack_reg, frame_off);
1059 } else {
1060 ASMNC(LDRdu, last_reg, stack_reg, frame_off);
1061 }
1062 }
1063 if (needs_stack_frame) {
1064 u32 body_start = func_start_off + func_prologue_alloc;
1065 this->text_writer.eh_advance(this->text_writer.offset() - body_start + 4);
1066 this->text_writer.eh_write_inst(dwarf::DW_CFA_restore_state);
1067 if (final_frame_size <= 0x1f8) {
1068 ASMNC(LDPx_post, DA_GP(29), DA_GP(30), DA_SP, final_frame_size);
1069 // CFI is correct here.
1070 } else {
1071 ASMNC(LDPx, DA_GP(29), DA_GP(30), DA_SP, 0);
1072 // CFI is correct here, but we need to update the CFA after the ADD.
1073 ASMNC(ADDxi, DA_SP, DA_SP, final_frame_size);
1074 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
1075 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 0);
1076 }
1077 }
1078
1079 ASMNC(RET, DA_GP(30));
1080 }
1081
1082 // TODO(ts): honor cur_needs_unwind_info
1083 this->text_writer.remove_prologue_bytes(func_start_off + prologue_size,
1084 func_prologue_alloc - prologue_size);
1085 auto func_size = this->text_writer.offset() - func_start_off;
1086 auto func_sym = this->func_syms[func_idx];
1087 auto func_sec = this->text_writer.get_sec_ref();
1088 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
1089 this->text_writer.eh_end_fde();
1090 this->text_writer.except_encode_func();
1091}
1092
1093template <IRAdaptor Adaptor,
1094 typename Derived,
1095 template <typename, typename, typename> typename BaseTy,
1096 typename Config>
1097void CompilerA64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
1098 func_ret_offs.clear();
1099 Base::reset();
1100}
1101
1102template <IRAdaptor Adaptor,
1103 typename Derived,
1104 template <typename, typename, typename> typename BaseTy,
1105 typename Config>
1106void CompilerA64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
1107 // Patched at the end, just reserve the space here.
1108 func_ret_offs.push_back(this->text_writer.offset());
1109 this->text_writer.ensure_space(4); // Single branch to actual epilogue.
1110 this->text_writer.cur_ptr() += 4;
1111}
1112
1113template <IRAdaptor Adaptor,
1114 typename Derived,
1115 template <typename, typename, typename> typename BaseTy,
1116 typename Config>
1117void CompilerA64<Adaptor, Derived, BaseTy, Config>::spill_reg(
1118 const AsmReg reg, const u32 frame_off, const u32 size) noexcept {
1119 assert(this->stack.frame_used);
1120 assert((size & (size - 1)) == 0);
1121 assert(util::align_up(frame_off, size) == frame_off);
1122 // We don't support stack frames that aren't encodeable with add/sub.
1123 assert(frame_off < 0x1'000'000);
1124 this->text_writer.ensure_space(8);
1125
1126 u32 off = frame_off;
1127 auto addr_base = AsmReg{AsmReg::FP};
1128 if (off >= 0x1000 * size) [[unlikely]] {
1129 // We cannot encode the offset in the store instruction.
1130 ASMNC(ADDxi, permanent_scratch_reg, DA_GP(29), off & ~0xfff);
1131 off &= 0xfff;
1132 addr_base = permanent_scratch_reg;
1133 }
1134
1135 assert(-static_cast<i32>(frame_off) < 0);
1136 if (reg.id() <= AsmReg::R30) {
1137 switch (size) {
1138 case 1: ASMNC(STRBu, reg, addr_base, off); break;
1139 case 2: ASMNC(STRHu, reg, addr_base, off); break;
1140 case 4: ASMNC(STRwu, reg, addr_base, off); break;
1141 case 8: ASMNC(STRxu, reg, addr_base, off); break;
1142 default: TPDE_UNREACHABLE("invalid register spill size");
1143 }
1144 } else {
1145 switch (size) {
1146 case 1: ASMNC(STRbu, reg, addr_base, off); break;
1147 case 2: ASMNC(STRhu, reg, addr_base, off); break;
1148 case 4: ASMNC(STRsu, reg, addr_base, off); break;
1149 case 8: ASMNC(STRdu, reg, addr_base, off); break;
1150 case 16: ASMNC(STRqu, reg, addr_base, off); break;
1151 default: TPDE_UNREACHABLE("invalid register spill size");
1152 }
1153 }
1154}
1155
1156template <IRAdaptor Adaptor,
1157 typename Derived,
1158 template <typename, typename, typename> typename BaseTy,
1159 typename Config>
1160void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
1161 const AsmReg dst,
1162 const i32 frame_off,
1163 const u32 size,
1164 const bool sign_extend) noexcept {
1165 assert(this->stack.frame_used);
1166 assert((size & (size - 1)) == 0);
1167 assert(util::align_up(frame_off, size) == frame_off);
1168 // We don't support stack frames that aren't encodeable with add/sub.
1169 assert(frame_off >= 0 && frame_off < 0x1'000'000);
1170 this->text_writer.ensure_space(8);
1171
1172 u32 off = frame_off;
1173 auto addr_base = AsmReg{AsmReg::FP};
1174 if (off >= 0x1000 * size) [[unlikely]] {
1175 // need to calculate this explicitly
1176 addr_base = dst.id() <= AsmReg::R30 ? dst : permanent_scratch_reg;
1177 ASMNC(ADDxi, addr_base, DA_GP(29), off & ~0xfff);
1178 off &= 0xfff;
1179 }
1180
1181 if (dst.id() <= AsmReg::R30) {
1182 if (!sign_extend) {
1183 switch (size) {
1184 case 1: ASMNC(LDRBu, dst, addr_base, off); break;
1185 case 2: ASMNC(LDRHu, dst, addr_base, off); break;
1186 case 4: ASMNC(LDRwu, dst, addr_base, off); break;
1187 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1188 default: TPDE_UNREACHABLE("invalid register spill size");
1189 }
1190 } else {
1191 switch (size) {
1192 case 1: ASMNC(LDRSBwu, dst, addr_base, off); break;
1193 case 2: ASMNC(LDRSHwu, dst, addr_base, off); break;
1194 case 4: ASMNC(LDRSWxu, dst, addr_base, off); break;
1195 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1196 default: TPDE_UNREACHABLE("invalid register spill size");
1197 }
1198 }
1199 return;
1200 }
1201
1202 assert(!sign_extend);
1203
1204 switch (size) {
1205 case 1: ASMNC(LDRbu, dst, addr_base, off); break;
1206 case 2: ASMNC(LDRhu, dst, addr_base, off); break;
1207 case 4: ASMNC(LDRsu, dst, addr_base, off); break;
1208 case 8: ASMNC(LDRdu, dst, addr_base, off); break;
1209 case 16: ASMNC(LDRqu, dst, addr_base, off); break;
1210 default: TPDE_UNREACHABLE("invalid register spill size");
1211 }
1212}
1213
1214template <IRAdaptor Adaptor,
1215 typename Derived,
1216 template <typename, typename, typename> typename BaseTy,
1217 typename Config>
1218void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
1219 const AsmReg dst, const AssignmentPartRef ap) noexcept {
1220 assert(this->stack.frame_used);
1221 auto frame_off = ap.variable_stack_off();
1222 assert(frame_off >= 0);
1223 if (!ASMIF(ADDxi, dst, DA_GP(29), frame_off)) {
1224 materialize_constant(frame_off, Config::GP_BANK, 4, dst);
1225 ASM(ADDx_uxtw, dst, DA_GP(29), dst, 0);
1226 }
1227}
1228
1229template <IRAdaptor Adaptor,
1230 typename Derived,
1231 template <typename, typename, typename> typename BaseTy,
1232 typename Config>
1233void CompilerA64<Adaptor, Derived, BaseTy, Config>::mov(
1234 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
1235 this->text_writer.ensure_space(4);
1236 assert(dst.valid());
1237 assert(src.valid());
1238 if (dst.id() <= AsmReg::SP && src.id() <= AsmReg::SP) {
1239 assert(dst.id() != AsmReg::SP && src.id() != AsmReg::SP);
1240 if (size > 4) {
1241 ASMNC(MOVx, dst, src);
1242 } else {
1243 ASMNC(MOVw, dst, src);
1244 }
1245 } else if (dst.id() >= AsmReg::V0 && src.id() >= AsmReg::V0) {
1246 ASMNC(ORR16b, dst, src, src);
1247 } else if (dst.id() <= AsmReg::SP) {
1248 assert(dst.id() != AsmReg::SP);
1249 // gp<-vector
1250 assert(src.id() >= AsmReg::V0);
1251 assert(size <= 8);
1252 if (size <= 4) {
1253 ASMNC(FMOVws, dst, src);
1254 } else {
1255 ASMNC(FMOVxd, dst, src);
1256 }
1257 } else {
1258 // vector<-gp
1259 assert(src.id() <= AsmReg::R30);
1260 assert(dst.id() >= AsmReg::V0);
1261 assert(size <= 8);
1262 if (size <= 4) {
1263 ASMNC(FMOVsw, dst, src);
1264 } else {
1265 ASMNC(FMOVdx, dst, src);
1266 }
1267 }
1268}
1269
1270template <IRAdaptor Adaptor,
1271 typename Derived,
1272 template <typename, typename, typename> typename BaseTy,
1273 typename Config>
1274AsmReg CompilerA64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1275 GenericValuePart &gv) noexcept {
1276 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1277
1278 ScratchReg scratch{derived()};
1279 if (!expr.has_base() && !expr.has_index()) {
1280 AsmReg dst = scratch.alloc_gp();
1281 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, dst);
1282 expr.disp = 0;
1283 } else if (!expr.has_base() && expr.has_index()) {
1284 AsmReg index_reg = expr.index_reg();
1285 if (std::holds_alternative<ScratchReg>(expr.index)) {
1286 scratch = std::move(std::get<ScratchReg>(expr.index));
1287 } else {
1288 (void)scratch.alloc_gp();
1289 }
1290 AsmReg dst = scratch.cur_reg();
1291 if ((expr.scale & (expr.scale - 1)) == 0) {
1292 const auto shift = util::cnt_tz<u64>(expr.scale);
1293 ASM(LSLxi, dst, index_reg, shift);
1294 } else {
1295 AsmReg tmp2 = permanent_scratch_reg;
1296 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1297 ASM(MULx, dst, index_reg, tmp2);
1298 }
1299 } else if (expr.has_base() && expr.has_index()) {
1300 AsmReg base_reg = expr.base_reg();
1301 AsmReg index_reg = expr.index_reg();
1302 if (std::holds_alternative<ScratchReg>(expr.base)) {
1303 scratch = std::move(std::get<ScratchReg>(expr.base));
1304 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1305 scratch = std::move(std::get<ScratchReg>(expr.index));
1306 } else {
1307 (void)scratch.alloc_gp();
1308 }
1309 AsmReg dst = scratch.cur_reg();
1310 if ((expr.scale & (expr.scale - 1)) == 0) {
1311 const auto shift = util::cnt_tz<u64>(expr.scale);
1312 ASM(ADDx_lsl, dst, base_reg, index_reg, shift);
1313 } else {
1314 AsmReg tmp2 = permanent_scratch_reg;
1315 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1316 ASM(MADDx, dst, index_reg, tmp2, base_reg);
1317 }
1318 } else if (expr.has_base() && !expr.has_index()) {
1319 AsmReg base_reg = expr.base_reg();
1320 if (std::holds_alternative<ScratchReg>(expr.base)) {
1321 scratch = std::move(std::get<ScratchReg>(expr.base));
1322 } else {
1323 (void)scratch.alloc_gp();
1324 }
1325 AsmReg dst = scratch.cur_reg();
1326 if (expr.disp != 0 && ASMIF(ADDxi, dst, base_reg, expr.disp)) {
1327 expr.disp = 0;
1328 } else if (dst != base_reg) {
1329 ASM(MOVx, dst, base_reg);
1330 }
1331 } else {
1332 TPDE_UNREACHABLE("inconsistent GenericValuePart::Expr");
1333 }
1334
1335 AsmReg dst = scratch.cur_reg();
1336 if (expr.disp != 0) {
1337 if (!ASMIF(ADDxi, dst, dst, expr.disp)) {
1338 AsmReg tmp2 = permanent_scratch_reg;
1339 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, tmp2);
1340 ASM(ADDx, dst, dst, tmp2);
1341 }
1342 }
1343
1344 gv.state = std::move(scratch);
1345 return dst;
1346}
1347
1348template <IRAdaptor Adaptor,
1349 typename Derived,
1350 template <typename, typename, typename> typename BaseTy,
1351 typename Config>
1352void CompilerA64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
1353 u64 size, u32 align, ValuePart &res) noexcept {
1354 assert(this->stack.has_dynamic_alloca &&
1355 "function marked as not having dynamic allocas can't have alloca");
1356 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1357 size = tpde::util::align_up(size, 16);
1358 AsmReg res_reg = res.alloc_reg(this);
1359 if (size >= 0x10'0000) {
1360 auto tmp = permanent_scratch_reg;
1361 materialize_constant(size, Config::GP_BANK, 8, tmp);
1362 ASM(SUBx_uxtx, res_reg, DA_SP, tmp, 0);
1363 } else if (size >= 0x1000) {
1364 ASM(SUBxi, res_reg, DA_SP, size & 0xff'f000);
1365 ASM(SUBxi, res_reg, res_reg, size & 0xfff);
1366 } else {
1367 ASM(SUBxi, res_reg, DA_SP, size & 0xfff);
1368 }
1369
1370 if (align > 16) {
1371 // The stack pointer is always at least 16-byte aligned.
1372 ASM(ANDxi, res_reg, res_reg, ~(u64{align} - 1));
1373 }
1374
1375 if (size > 0) {
1376 ASM(MOV_SPx, DA_SP, res_reg);
1377 }
1378}
1379
1380template <IRAdaptor Adaptor,
1381 typename Derived,
1382 template <typename, typename, typename> typename BaseTy,
1383 typename Config>
1384void CompilerA64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
1385 u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
1386 assert(this->stack.has_dynamic_alloca &&
1387 "function marked as not having dynamic allocas can't have alloca");
1388 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1389 AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
1390 AsmReg res_reg = res.alloc_try_reuse(this, count);
1391
1392 if (elem_size == 0) {
1393 ASM(MOVZw, res_reg, 0);
1394 } else if ((elem_size & (elem_size - 1)) == 0) {
1395 const auto shift = util::cnt_tz(elem_size);
1396 if (shift <= 4) {
1397 ASM(SUBx_uxtx, res_reg, DA_SP, size_reg, shift);
1398 } else {
1399 ASM(LSLxi, res_reg, size_reg, shift);
1400 ASM(SUBx_uxtx, res_reg, DA_SP, res_reg, 0);
1401 }
1402 } else {
1403 auto tmp = permanent_scratch_reg;
1404 materialize_constant(elem_size, Config::GP_BANK, 8, tmp);
1405 ASM(MULx, res_reg, size_reg, tmp);
1406 ASM(SUBx_uxtx, res_reg, DA_SP, res_reg, 0);
1407 }
1408
1409 align = align > 16 ? align : 16;
1410 if (elem_size & (align - 1)) {
1411 ASM(ANDxi, res_reg, res_reg, ~(u64{align} - 1));
1412 }
1413
1414 ASM(MOV_SPx, DA_SP, res_reg);
1415}
1416
1417template <IRAdaptor Adaptor,
1418 typename Derived,
1419 template <typename, typename, typename> typename BaseTy,
1420 typename Config>
1421void CompilerA64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1422 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1423 this->text_writer.ensure_space(5 * 4);
1424
1425 const auto const_u64 = data[0];
1426 if (bank == Config::GP_BANK) {
1427 assert(size <= 8);
1428 if (const_u64 == 0) {
1429 ASMNC(MOVZw, dst, 0);
1430 return;
1431 }
1432
1433 this->text_writer.cur_ptr() +=
1434 sizeof(u32) *
1435 de64_MOVconst(reinterpret_cast<u32 *>(this->text_writer.cur_ptr()),
1436 dst,
1437 const_u64);
1438 return;
1439 }
1440
1441 assert(bank == Config::FP_BANK);
1442 // Try instructions that take an immediate
1443 if (size == 4) {
1444 if (ASMIF(FMOVsi, dst, std::bit_cast<float>((u32)const_u64))) {
1445 return;
1446 } else if (ASMIF(MOVId, dst, static_cast<u32>(const_u64))) {
1447 return;
1448 }
1449 } else if (size == 8) {
1450 if (ASMIF(FMOVdi, dst, std::bit_cast<double>(const_u64))) {
1451 return;
1452 } else if (ASMIF(MOVId, dst, const_u64)) {
1453 return;
1454 }
1455 } else if (size == 16) {
1456 const auto high_u64 = data[1];
1457 if (const_u64 == high_u64 && ASMIF(MOVI2d, dst, const_u64)) {
1458 return;
1459 } else if (high_u64 == 0 && ASMIF(MOVId, dst, const_u64)) {
1460 return;
1461 }
1462 }
1463
1464 // We must either load through a GP register or from memory. Both cases need a
1465 // GP register in the common case. We reserve x16/x17 for cases like this.
1466 if (size <= 16) {
1467 this->register_file.mark_clobbered(permanent_scratch_reg);
1468 // Copy from a GP register
1469 // TODO: always load from memory?
1470 if (size <= 8) {
1471 materialize_constant(data, Config::GP_BANK, size, permanent_scratch_reg);
1472 if (size <= 4) {
1473 ASMNC(FMOVsw, dst, permanent_scratch_reg);
1474 } else {
1475 ASMNC(FMOVdx, dst, permanent_scratch_reg);
1476 }
1477 return;
1478 }
1479
1480 auto rodata = this->assembler.get_data_section(true, false);
1481 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), size};
1482 auto sym = this->assembler.sym_def_data(
1483 rodata, "", raw_data, 16, Assembler::SymBinding::LOCAL);
1484 this->text_writer.ensure_space(8); // ensure contiguous instructions
1485 this->reloc_text(
1486 sym, elf::R_AARCH64_ADR_PREL_PG_HI21, this->text_writer.offset(), 0);
1487 ASMNC(ADRP, permanent_scratch_reg, 0, 0);
1488 this->reloc_text(
1489 sym, elf::R_AARCH64_LDST128_ABS_LO12_NC, this->text_writer.offset(), 0);
1490 ASMNC(LDRqu, dst, permanent_scratch_reg, 0);
1491 return;
1492 }
1493
1494 TPDE_FATAL("unable to materialize constant");
1495}
1496
1497template <IRAdaptor Adaptor,
1498 typename Derived,
1499 template <typename, typename, typename> typename BaseTy,
1500 typename Config>
1501AsmReg
1502 CompilerA64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1503 AssignmentPartRef ap, IRValueRef) noexcept {
1504 RegBank bank = ap.bank();
1505 if (bank == Config::FP_BANK && ap.part_size() > 8) {
1506 // FP registers can not in general be fixed registers, as only the lowest 8
1507 // bytes are callee-saved.
1508 return AsmReg::make_invalid();
1509 }
1510
1511 // TODO(ts): why is this in here?
1512 assert(bank.id() <= Config::NUM_BANKS);
1513 auto reg_mask = this->register_file.bank_regs(bank);
1514 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1515
1516 const auto find_possible_regs = [this,
1517 reg_mask](const u64 preferred_regs) -> u64 {
1518 // try to first get an unused reg, otherwise an unfixed reg
1519 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1520 return free_regs & preferred_regs & reg_mask;
1521 };
1522
1523 u64 possible_regs;
1524 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1525 if (!this->stack.is_leaf_function) {
1526 // we can only allocate fixed assignments from the callee-saved regs
1527 possible_regs = find_possible_regs(csr);
1528 } else {
1529 // try allocating any non-callee saved register first, except the result
1530 // registers
1531 possible_regs = find_possible_regs(~csr);
1532 if (possible_regs == 0) {
1533 // otherwise fallback to callee-saved regs
1534 possible_regs = find_possible_regs(csr);
1535 }
1536 }
1537
1538 if (possible_regs == 0) {
1539 return AsmReg::make_invalid();
1540 }
1541
1542 // try to first get an unused reg, otherwise an unfixed reg
1543 if ((possible_regs & ~this->register_file.used) != 0) {
1544 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1545 }
1546
1547 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1548 const auto reg = AsmReg{reg_id};
1549
1550 assert(!this->register_file.is_fixed(reg));
1551
1552 const auto local_idx = this->register_file.reg_local_idx(reg);
1553 const auto part = this->register_file.reg_part(reg);
1554 assert(local_idx != Base::INVALID_VAL_LOCAL_IDX);
1555
1556 auto *assignment = this->val_assignment(local_idx);
1557 auto ap = AssignmentPartRef{assignment, part};
1558 if (ap.modified()) {
1559 continue;
1560 }
1561
1562 return reg;
1563 }
1564
1565 return AsmReg::make_invalid();
1566}
1567
1568template <IRAdaptor Adaptor,
1569 typename Derived,
1570 template <typename, typename, typename> class BaseTy,
1571 typename Config>
1572typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1573 CompilerA64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1574 Jump jmp) noexcept {
1575 switch (jmp.kind) {
1576 case Jump::Jeq: return jmp.change_kind(Jump::Jne);
1577 case Jump::Jne: return jmp.change_kind(Jump::Jeq);
1578 case Jump::Jcs: return jmp.change_kind(Jump::Jcc);
1579 case Jump::Jcc: return jmp.change_kind(Jump::Jcs);
1580 case Jump::Jmi: return jmp.change_kind(Jump::Jpl);
1581 case Jump::Jpl: return jmp.change_kind(Jump::Jmi);
1582 case Jump::Jvs: return jmp.change_kind(Jump::Jvc);
1583 case Jump::Jvc: return jmp.change_kind(Jump::Jvs);
1584 case Jump::Jhi: return jmp.change_kind(Jump::Jls);
1585 case Jump::Jls: return jmp.change_kind(Jump::Jhi);
1586 case Jump::Jge: return jmp.change_kind(Jump::Jlt);
1587 case Jump::Jlt: return jmp.change_kind(Jump::Jge);
1588 case Jump::Jgt: return jmp.change_kind(Jump::Jle);
1589 case Jump::Jle: return jmp.change_kind(Jump::Jgt);
1590 case Jump::jmp: return jmp;
1591 case Jump::Cbz: return jmp.change_kind(Jump::Cbnz);
1592 case Jump::Cbnz: return jmp.change_kind(Jump::Cbz);
1593 case Jump::Tbz: return jmp.change_kind(Jump::Tbnz);
1594 case Jump::Tbnz: return jmp.change_kind(Jump::Tbz);
1595 default: TPDE_UNREACHABLE("invalid jump kind");
1596 }
1597}
1598
1599template <IRAdaptor Adaptor,
1600 typename Derived,
1601 template <typename, typename, typename> typename BaseTy,
1602 typename Config>
1603typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1604 CompilerA64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1605 Jump jmp) noexcept {
1606 switch (jmp.kind) {
1607 case Jump::Jeq: return jmp.change_kind(Jump::Jeq);
1608 case Jump::Jne: return jmp.change_kind(Jump::Jne);
1609 case Jump::Jcc: return jmp.change_kind(Jump::Jhi);
1610 case Jump::Jcs: return jmp.change_kind(Jump::Jls);
1611 case Jump::Jhi: return jmp.change_kind(Jump::Jcc);
1612 case Jump::Jls: return jmp.change_kind(Jump::Jcs);
1613 case Jump::Jge: return jmp.change_kind(Jump::Jle);
1614 case Jump::Jlt: return jmp.change_kind(Jump::Jgt);
1615 case Jump::Jgt: return jmp.change_kind(Jump::Jlt);
1616 case Jump::Jle: return jmp.change_kind(Jump::Jge);
1617 case Jump::jmp: return jmp;
1618 case Jump::Jmi:
1619 case Jump::Jpl:
1620 case Jump::Jvs:
1621 case Jump::Jvc:
1622 case Jump::Cbz:
1623 case Jump::Cbnz:
1624 case Jump::Tbz:
1625 case Jump::Tbnz:
1626 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1627 }
1628}
1629
1630template <IRAdaptor Adaptor,
1631 typename Derived,
1632 template <typename, typename, typename> typename BaseTy,
1633 typename Config>
1634void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1635 Jump jmp, Label target_label) noexcept {
1636 const auto is_pending = this->text_writer.label_is_pending(target_label);
1637 this->text_writer.ensure_space(4);
1638 if (jmp.kind == Jump::jmp) {
1639 if (is_pending) {
1640 ASMNC(B, 0);
1641 this->text_writer.label_ref(target_label,
1642 this->text_writer.offset() - 4,
1643 LabelFixupKind::AARCH64_BR);
1644 } else {
1645 const auto label_off = this->text_writer.label_offset(target_label);
1646 const auto cur_off = this->text_writer.offset();
1647 assert(cur_off >= label_off);
1648 const auto diff = cur_off - label_off;
1649 assert((diff & 0b11) == 0);
1650 assert(diff < 128 * 1024 * 1024);
1651
1652 ASMNC(B, -static_cast<ptrdiff_t>(diff) / 4);
1653 }
1654 return;
1655 }
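  // Forward (pending) branches are emitted with a zero displacement and
  // recorded via label_ref(); the LabelFixupKind tells the writer how to patch
  // the immediate once the label is placed at its final offset.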
1656
1657 if (jmp.kind == Jump::Cbz || jmp.kind == Jump::Cbnz) {
1658 u32 off = 0;
1659 if (!is_pending) {
1660 const auto label_off = this->text_writer.label_offset(target_label);
1661 const auto cur_off = this->text_writer.offset();
1662 assert(cur_off >= label_off);
1663 off = cur_off - label_off;
1664 assert((off & 0b11) == 0);
1665 assert(off < 128 * 1024 * 1024);
1666 }
1667
1668 if (off <= 1024 * 1024) {
1669 auto imm19 = -static_cast<ptrdiff_t>(off) / 4;
1670 if (jmp.kind == Jump::Cbz) {
1671 if (jmp.cmp_is_32) {
1672 ASMNC(CBZw, jmp.cmp_reg, imm19);
1673 } else {
1674 ASMNC(CBZx, jmp.cmp_reg, imm19);
1675 }
1676 } else {
1677 if (jmp.cmp_is_32) {
1678 ASMNC(CBNZw, jmp.cmp_reg, imm19);
1679 } else {
1680 ASMNC(CBNZx, jmp.cmp_reg, imm19);
1681 }
1682 }
1683
1684 if (is_pending) {
1685 this->text_writer.label_ref(target_label,
1686 this->text_writer.offset() - 4,
1687 LabelFixupKind::AARCH64_COND_BR);
1688 }
1689 } else {
1690 assert(!is_pending);
1691 this->text_writer.ensure_space(2 * 4);
1692
1693 if (jmp.kind == Jump::Cbz) {
1694 if (jmp.cmp_is_32) { // need to jump over 2 instructions
1695 ASMNC(CBNZw, jmp.cmp_reg, 2);
1696 } else {
1697 ASMNC(CBNZx, jmp.cmp_reg, 2);
1698 }
1699 } else {
1700 if (jmp.cmp_is_32) {
1701 ASMNC(CBZw, jmp.cmp_reg, 2);
1702 } else {
1703 ASMNC(CBZx, jmp.cmp_reg, 2);
1704 }
1705 }
1706 // + 4 since we already wrote the cb(n)z instruction
1707 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1708 }
1709 return;
1710 }
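  // Far-branch sketch (hypothetical w0/target): beyond the +-1 MiB CB(N)Z
  // range, the path above inverts the test to skip one instruction and lets an
  // unconditional B (+-128 MiB range) cover the distance:
  //   cbnz w0, .+8        ; condition false -> skip the far branch
  //   b    <target>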
1711
1712 if (jmp.kind == Jump::Tbz || jmp.kind == Jump::Tbnz) {
1713 u32 off = 0;
1714 if (!is_pending) {
1715 const auto label_off = this->text_writer.label_offset(target_label);
1716 const auto cur_off = this->text_writer.offset();
1717 assert(cur_off >= label_off);
1718 off = cur_off - label_off;
1719 assert((off & 0b11) == 0);
1720 assert(off < 128 * 1024 * 1024);
1721 }
1722
1723 if (off <= 32 * 1024) {
1724 auto imm14 = -static_cast<ptrdiff_t>(off) / 4;
1725 if (jmp.kind == Jump::Tbz) {
1726 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, imm14);
1727 } else {
1728 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, imm14);
1729 }
1730
1731 if (is_pending) {
1732 this->text_writer.label_ref(target_label,
1733 this->text_writer.offset() - 4,
1734 LabelFixupKind::AARCH64_TEST_BR);
1735 }
1736 } else {
1737 assert(!is_pending);
1738 this->text_writer.ensure_space(2 * 4);
1739
1740 if (jmp.kind == Jump::Tbz) {
1741 // need to jump over 2 instructions
1742 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, 2);
1743 } else {
1744 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, 2);
1745 }
1746 // + 4 since we already wrote the tb(n)z instruction
1747 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1748 }
1749 return;
1750 }
1751
1752 Da64Cond cond, cond_compl;
1753 switch (jmp.kind) {
1754 case Jump::Jeq:
1755 cond = DA_EQ;
1756 cond_compl = DA_NE;
1757 break;
1758 case Jump::Jne:
1759 cond = DA_NE;
1760 cond_compl = DA_EQ;
1761 break;
1762 case Jump::Jcs:
1763 cond = DA_CS;
1764 cond_compl = DA_CC;
1765 break;
1766 case Jump::Jcc:
1767 cond = DA_CC;
1768 cond_compl = DA_CS;
1769 break;
1770 case Jump::Jmi:
1771 cond = DA_MI;
1772 cond_compl = DA_PL;
1773 break;
1774 case Jump::Jpl:
1775 cond = DA_PL;
1776 cond_compl = DA_MI;
1777 break;
1778 case Jump::Jvs:
1779 cond = DA_VS;
1780 cond_compl = DA_VC;
1781 break;
1782 case Jump::Jvc:
1783 cond = DA_VC;
1784 cond_compl = DA_VS;
1785 break;
1786 case Jump::Jhi:
1787 cond = DA_HI;
1788 cond_compl = DA_LS;
1789 break;
1790 case Jump::Jls:
1791 cond = DA_LS;
1792 cond_compl = DA_HI;
1793 break;
1794 case Jump::Jge:
1795 cond = DA_GE;
1796 cond_compl = DA_LT;
1797 break;
1798 case Jump::Jlt:
1799 cond = DA_LT;
1800 cond_compl = DA_GE;
1801 break;
1802 case Jump::Jgt:
1803 cond = DA_GT;
1804 cond_compl = DA_LE;
1805 break;
1806 case Jump::Jle:
1807 cond = DA_LE;
1808 cond_compl = DA_GT;
1809 break;
1810 default: TPDE_UNREACHABLE("invalid jump kind");
1811 }
1812
1813
1814 u32 off = 0;
1815 if (!is_pending) {
1816 const auto label_off = this->text_writer.label_offset(target_label);
1817 const auto cur_off = this->text_writer.offset();
1818 assert(cur_off >= label_off);
1819 off = cur_off - label_off;
1820 assert((off & 0b11) == 0);
1821 assert(off < 128 * 1024 * 1024);
1822 }
1823
1824 if (off <= 1024 * 1024) {
1825 ASMNC(BCOND, cond, -static_cast<ptrdiff_t>(off) / 4);
1826
1827 if (is_pending) {
1828 this->text_writer.label_ref(target_label,
1829 this->text_writer.offset() - 4,
1830 LabelFixupKind::AARCH64_COND_BR);
1831 }
1832 } else {
1833 assert(!is_pending);
1834 this->text_writer.ensure_space(2 * 4);
1835
1836 // 2 to skip over the branch following
1837 ASMNC(BCOND, cond_compl, 2);
1838 // + 4 since we already wrote the branch instruction
1839 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1840 }
1841}
1842template <IRAdaptor Adaptor,
1843 typename Derived,
1844 template <typename, typename, typename> class BaseTy,
1845 typename Config>
1846Da64Cond CompilerA64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
1847    Jump jmp) noexcept {
1848 switch (jmp.kind) {
1849 case Jump::Jeq: return DA_EQ;
1850 case Jump::Jne: return DA_NE;
1851 case Jump::Jcs: return DA_CS;
1852 case Jump::Jcc: return DA_CC;
1853 case Jump::Jmi: return DA_MI;
1854 case Jump::Jpl: return DA_PL;
1855 case Jump::Jvs: return DA_VS;
1856 case Jump::Jvc: return DA_VC;
1857 case Jump::Jhi: return DA_HI;
1858 case Jump::Jls: return DA_LS;
1859 case Jump::Jge: return DA_GE;
1860 case Jump::Jlt: return DA_LT;
1861 case Jump::Jgt: return DA_GT;
1862 case Jump::Jle: return DA_LE;
1863 case Jump::jmp: return DA_AL;
1864 default: TPDE_UNREACHABLE("invalid jump kind for conversion to Da64Cond");
1865 }
1866}
1867
1868template <IRAdaptor Adaptor,
1869 typename Derived,
1870 template <typename, typename, typename> class BaseTy,
1871 typename Config>
1872void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1873    Jump cc, AsmReg dst) noexcept {
1874 ASM(CSETw, dst, jump_to_cond(cc));
1875}
1876
1877template <IRAdaptor Adaptor,
1878 typename Derived,
1879 template <typename, typename, typename> class BaseTy,
1880 typename Config>
1881void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1882    Jump cc, AsmReg dst) noexcept {
1883 ASM(CSETMx, dst, jump_to_cond(cc));
1884}
1885template <IRAdaptor Adaptor,
1886 typename Derived,
1887 template <typename, typename, typename> class BaseTy,
1888 typename Config>
1889void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_select(
1890    Jump cc,
1891 AsmReg dst,
1892 AsmReg true_select,
1893 AsmReg false_select,
1894 bool is_64) noexcept {
1895 this->text_writer.ensure_space(4);
1896 Da64Cond cond = jump_to_cond(cc);
1897 if (is_64) {
1898 ASMNC(CSELx, dst, true_select, false_select, cond);
1899 } else {
1900 ASMNC(CSELw, dst, true_select, false_select, cond);
1901 }
1902}
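// Sketch: with cc == Jump::Jeq, dst = x0, true_select = x1, false_select = x2
// and is_64 == true, the body above assembles to "csel x0, x1, x2, eq".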
1903
1904template <IRAdaptor Adaptor,
1905 typename Derived,
1906 template <typename, typename, typename> class BaseTy,
1907 typename Config>
1908void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1909    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1910 assert(from < to && to <= 64);
1911 (void)to;
1912 if (sign) {
1913 if (to <= 32) {
1914 ASM(SBFXw, dst, src, 0, from);
1915 } else {
1916 ASM(SBFXx, dst, src, 0, from);
1917 }
1918 } else {
1919 if (to <= 32) {
1920 ASM(UBFXw, dst, src, 0, from);
1921 } else {
1922 ASM(UBFXx, dst, src, 0, from);
1923 }
1924 }
1925}
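// Sketch: sign-extending the low 8 bits to 64 bits (sign == true, from == 8,
// to == 64) emits "sbfx xd, xs, #0, #8"; the unsigned path uses ubfx, and
// to <= 32 selects the W register forms.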
1926
1927template <IRAdaptor Adaptor,
1928 typename Derived,
1929 template <typename, typename, typename> typename BaseTy,
1930 typename Config>
1931void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_call(
1932    std::variant<SymRef, ValuePart> &&target,
1933 std::span<CallArg> arguments,
1934 typename Base::ValueRef *result,
1935 bool) {
1936 CCAssignerAAPCS assigner;
1937 CallBuilder cb{*derived(), assigner};
1938 for (auto &arg : arguments) {
1939 cb.add_arg(std::move(arg));
1940 }
1941 cb.call(std::move(target));
1942 if (result) {
1943 cb.add_ret(*result);
1944 }
1945}
1946
1947template <IRAdaptor Adaptor,
1948 typename Derived,
1949 template <typename, typename, typename> typename BaseTy,
1950 typename Config>
1951void CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
1952 AsmReg cmp_reg, AsmReg tmp_reg, u64 case_value, bool width_is_32) noexcept {
1953 if (width_is_32) {
1954 if (!ASMIF(CMPwi, cmp_reg, case_value)) {
1955 materialize_constant(case_value, Config::GP_BANK, 4, tmp_reg);
1956 ASM(CMPw, cmp_reg, tmp_reg);
1957 }
1958 } else {
1959 if (!ASMIF(CMPxi, cmp_reg, case_value)) {
1960      materialize_constant(case_value, Config::GP_BANK, 8, tmp_reg); // full 64-bit constant for CMPx
1961 ASM(CMPx, cmp_reg, tmp_reg);
1962 }
1963 }
1964}
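// Sketch: case values encodable as a 12-bit immediate (optionally LSL #12)
// compare directly, e.g. "cmp w0, #42"; anything else is first materialized
// into tmp_reg and compared register-to-register.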
1965
1966template <IRAdaptor Adaptor,
1967 typename Derived,
1968 template <typename, typename, typename> typename BaseTy,
1969 typename Config>
1970void CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
1971 Label case_label,
1972 AsmReg cmp_reg,
1973 AsmReg tmp_reg,
1974 u64 case_value,
1975 bool width_is_32) noexcept {
1976 switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
1977 generate_raw_jump(Jump::Jeq, case_label);
1978}
1979
1980template <IRAdaptor Adaptor,
1981 typename Derived,
1982 template <typename, typename, typename> typename BaseTy,
1983 typename Config>
1984bool CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_jump_table(
1985 Label default_label,
1986 std::span<const Label> labels,
1987 AsmReg cmp_reg,
1988 AsmReg tmp_reg,
1989 u64 low_bound,
1990 u64 high_bound,
1991 bool width_is_32) noexcept {
1992 if (low_bound != 0) {
1993 switch_emit_cmp(cmp_reg, tmp_reg, low_bound, width_is_32);
1994 generate_raw_jump(Jump::Jcc, default_label);
1995 }
1996 switch_emit_cmp(cmp_reg, tmp_reg, high_bound, width_is_32);
1997 generate_raw_jump(Jump::Jhi, default_label);
1998
1999 if (low_bound != 0) {
2000 if (!ASMIF(SUBxi, cmp_reg, cmp_reg, low_bound)) {
2001 this->materialize_constant(&low_bound, Config::GP_BANK, 8, tmp_reg);
2002 ASM(SUBx, cmp_reg, cmp_reg, tmp_reg);
2003 }
2004 }
2005
2006 // TODO: move jump table to read-only data section.
2007 this->text_writer.ensure_space(4 * 4 + 4 * labels.size());
2008
2009 Label jump_table = this->text_writer.label_create();
2010 u32 adr_off = this->text_writer.offset();
2011 this->text_writer.write_unchecked(u32(0)); // ADR tmp_reg, patched below.
2012
2013 if (width_is_32) {
2014 ASMNC(LDRSWxr_uxtw, cmp_reg, tmp_reg, cmp_reg, /*scale=*/true);
2015 } else {
2016 ASMNC(LDRSWxr_lsl, cmp_reg, tmp_reg, cmp_reg, /*scale=*/true);
2017 }
2018 ASMNC(ADDx, tmp_reg, tmp_reg, cmp_reg);
2019 ASMNC(BR, tmp_reg);
2020
2021 u32 table_off = this->text_writer.offset();
2022 this->text_writer.label_place(jump_table, table_off);
2023 for (Label label : labels) {
2024 this->text_writer.label_ref(
2025 label, this->text_writer.offset(), LabelFixupKind::AARCH64_JUMP_TABLE);
2026 this->text_writer.write_unchecked(table_off);
2027 }
2028
2029 assert(table_off - adr_off <= 1 * 1024 * 1024); // ADR has a 1 MiB range.
2030 u32 *adr = reinterpret_cast<u32 *>(this->text_writer.begin_ptr() + adr_off);
2031 *adr = de64_ADR(tmp_reg, adr_off, table_off);
2032 return true;
2033}
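// Emitted sequence (sketch, assuming cmp_reg = x0/w0 and tmp_reg = x1):
//   adr   x1, <jump_table>
//   ldrsw x0, [x1, w0, uxtw #2]   ; load the 32-bit entry for this case
//   add   x1, x1, x0              ; entries are offsets relative to the table
//   br    x1
// followed by one i32 table entry per label in `labels`.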
2034
2035template <IRAdaptor Adaptor,
2036 typename Derived,
2037 template <typename, typename, typename> typename BaseTy,
2038 typename Config>
2039void CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
2040 Label case_label,
2041 Label gt_label,
2042 AsmReg cmp_reg,
2043 AsmReg tmp_reg,
2044 u64 case_value,
2045 bool width_is_32) noexcept {
2046 switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
2047 generate_raw_jump(Jump::Jhi, gt_label);
2048}
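// Sketch: one step of a binary-search switch lowering. An exact match jumps to
// case_label, values above the pivot branch to gt_label, and smaller values
// fall through into the lower half of the search.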
2049
2050template <IRAdaptor Adaptor,
2051 typename Derived,
2052 template <typename, typename, typename> typename BaseTy,
2053 typename Config>
2054CompilerA64<Adaptor, Derived, BaseTy, Config>::ScratchReg
2055    CompilerA64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
2056        SymRef sym, TLSModel model) noexcept {
2057 switch (model) {
2058 default: // TODO: implement optimized access for non-gd-model
2059 case TLSModel::GlobalDynamic: {
2060 assert(!this->stack.is_leaf_function);
2061 this->stack.generated_call = true;
2062 ScratchReg r0_scratch{this};
2063 AsmReg r0 = r0_scratch.alloc_specific(AsmReg::R0);
2064 ScratchReg r1_scratch{this};
2065 AsmReg r1 = r1_scratch.alloc_specific(AsmReg::R1);
2066 // The call only clobbers flags, x0, x1, and lr. x0 and x1 are already fixed
2067 // in the scratch registers, so only make sure that lr isn't used otherwise.
2068 if (this->register_file.is_used(Reg{AsmReg::LR})) {
2069 this->evict_reg(Reg{AsmReg::LR});
2070 }
2071
2072 this->text_writer.ensure_space(0x18);
2073 this->reloc_text(
2074 sym, elf::R_AARCH64_TLSDESC_ADR_PAGE21, this->text_writer.offset(), 0);
2075 ASMNC(ADRP, r0, 0, 0);
2076 this->reloc_text(
2077 sym, elf::R_AARCH64_TLSDESC_LD64_LO12, this->text_writer.offset(), 0);
2078 ASMNC(LDRxu, r1, r0, 0);
2079 this->reloc_text(
2080 sym, elf::R_AARCH64_TLSDESC_ADD_LO12, this->text_writer.offset(), 0);
2081 ASMNC(ADDxi, r0, r0, 0);
2082 this->reloc_text(
2083 sym, elf::R_AARCH64_TLSDESC_CALL, this->text_writer.offset(), 0);
2084 ASMNC(BLR, r1);
2085 ASMNC(MRS, r1, 0xde82); // TPIDR_EL0
2086 // TODO: maybe return expr x0+x1.
2087 ASMNC(ADDx, r0, r1, r0);
2088 return r0_scratch;
2089 }
2090 }
2091}
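// TLSDESC call sequence emitted above (sketch):
//   adrp x0, :tlsdesc:sym
//   ldr  x1, [x0, #:tlsdesc_lo12:sym]
//   add  x0, x0, #:tlsdesc_lo12:sym
//   blr  x1                  ; :tlsdesc_call:sym, returns the TP offset in x0
//   mrs  x1, TPIDR_EL0
//   add  x0, x1, x0          ; thread pointer + offset = address of sym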
2092
2093} // namespace tpde::a64