TPDE
CompilerA64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "tpde/AssemblerElf.hpp"
7#include "tpde/AssignmentPartRef.hpp"
8#include "tpde/CompilerBase.hpp"
9#include "tpde/DWARF.hpp"
10#include "tpde/ELF.hpp"
11#include "tpde/arm64/FunctionWriterA64.hpp"
12#include "tpde/base.hpp"
13#include "tpde/util/SmallVector.hpp"
14#include "tpde/util/misc.hpp"
15
16#include <bit>
17#include <disarm64.h>
18
19// Helper macros for assembling in the compiler
20#if defined(ASM) || defined(ASMNC) || defined(ASMC)
21 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
22#endif
23
24/// Encode an instruction with an explicit compiler pointer
25#define ASMC(compiler, op, ...) \
26 ((compiler)->text_writer.write_inst(de64_##op(__VA_ARGS__)))
27/// Encode an instruction into this
28#define ASM(...) ASMC(this, __VA_ARGS__)
29/// Encode an instruction without checking that enough space is available
30#define ASMNC(op, ...) \
31 (this->text_writer.write_inst_unchecked(de64_##op(__VA_ARGS__)))
32/// Encode an instruction if the encoding is successful (returns true)
33#define ASMIFC(compiler, op, ...) \
34 ((compiler)->text_writer.try_write_inst(de64_##op(__VA_ARGS__)))
35/// Encode an instruction if the encoding is successful (returns true)
36#define ASMIF(...) ASMIFC(this, __VA_ARGS__)
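// Illustrative expansion (added note, not part of the original header): within a
// compiler member function,
//   ASM(ADDxi, dst, src, 16);
// expands to
//   this->text_writer.write_inst(de64_ADDxi(dst, src, 16));
// while ASMIF(...) calls try_write_inst() and reports as a bool whether disarm64
// produced a valid encoding that was written.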
37
38namespace tpde::a64 {
39
40struct AsmReg : Reg {
41 enum REG : u8 {
42 R0 = 0,
43 R1,
44 R2,
45 R3,
46 R4,
47 R5,
48 R6,
49 R7,
50 R8,
51 R9,
52 R10,
53 R11,
54 R12,
55 R13,
56 R14,
57 R15,
58 R16,
59 R17,
60 R18,
61 R19,
62 R20,
63 R21,
64 R22,
65 R23,
66 R24,
67 R25,
68 R26,
69 R27,
70 R28,
71 R29,
72 FP = 29,
73 R30,
74 LR = 30,
75 SP = 31,
76
77 V0 = 32,
78 V1,
79 V2,
80 V3,
81 V4,
82 V5,
83 V6,
84 V7,
85 V8,
86 V9,
87 V10,
88 V11,
89 V12,
90 V13,
91 V14,
92 V15,
93 V16,
94 V17,
95 V18,
96 V19,
97 V20,
98 V21,
99 V22,
100 V23,
101 V24,
102 V25,
103 V26,
104 V27,
105 V28,
106 V29,
107 V30,
108 V31
109 };
110
111 constexpr explicit AsmReg() : Reg((u8)0xFF) {}
112
113 constexpr AsmReg(const REG id) : Reg((u8)id) {}
114
115 constexpr AsmReg(const Reg base) : Reg(base) {}
116
117 constexpr explicit AsmReg(const u64 id) : Reg(id) {
118 assert(id <= SP || (id >= V0 && id <= V31));
119 }
120
121 operator DA_GReg() const {
122 assert(reg_id < V0);
123 return DA_GReg{reg_id};
124 }
125
126 operator DA_GRegZR() const {
127 assert(reg_id < V0);
128 assert(reg_id != SP); // 31 means SP in our enums
129 return DA_GRegZR{reg_id};
130 }
131
132 operator DA_GRegSP() const {
133 assert(reg_id <= SP);
134 return DA_GRegSP{reg_id};
135 }
136
137 operator DA_VReg() const {
138 assert(reg_id >= V0 && reg_id <= V31);
139 return DA_VReg{static_cast<u8>(reg_id - V0)};
140 }
141};
142
143constexpr static u64
144 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
145 u64 set = 0;
146 for (const auto reg : regs) {
147 set |= 1ull << reg;
148 }
149 return set;
150}
151
152template <size_t N>
153constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
154 u64 set = 0;
155 for (const auto reg : regs) {
156 set |= 1ull << reg.id();
157 }
158 return set;
159}
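// Worked example (added for illustration): create_bitmask({AsmReg::R0, AsmReg::R2})
// evaluates to (1ull << 0) | (1ull << 2) == 0b101, i.e. one bit per register id in
// the same layout that CCInfo::allocatable_regs and friends use below.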
160
161/// AArch64 AAPCS calling convention.
162class CCAssignerAAPCS : public CCAssigner {
163 static constexpr CCInfo Info{
164 // we reserve SP, FP, R16 and R17 for our special use cases
165 .allocatable_regs =
166 0xFFFF'FFFF'FFFF'FFFF &
167 ~create_bitmask({AsmReg::SP, AsmReg::FP, AsmReg::R16, AsmReg::R17}),
168 // callee-saved registers
169 .callee_saved_regs = create_bitmask({
170 AsmReg::R19,
171 AsmReg::R20,
172 AsmReg::R21,
173 AsmReg::R22,
174 AsmReg::R23,
175 AsmReg::R24,
176 AsmReg::R25,
177 AsmReg::R26,
178 AsmReg::R27,
179 AsmReg::R28,
180 AsmReg::V8,
181 AsmReg::V9,
182 AsmReg::V10,
183 AsmReg::V11,
184 AsmReg::V12,
185 AsmReg::V13,
186 AsmReg::V14,
187 AsmReg::V15,
188 }),
189 .arg_regs = create_bitmask({
190 AsmReg::R0,
191 AsmReg::R1,
192 AsmReg::R2,
193 AsmReg::R3,
194 AsmReg::R4,
195 AsmReg::R5,
196 AsmReg::R6,
197 AsmReg::R7,
198 AsmReg::R8, // sret register
199 AsmReg::V0,
200 AsmReg::V1,
201 AsmReg::V2,
202 AsmReg::V3,
203 AsmReg::V4,
204 AsmReg::V5,
205 AsmReg::V6,
206 AsmReg::V7,
207 }),
208 };
209
210 // NGRN = Next General-purpose Register Number
211 // NSRN = Next SIMD/FP Register Number
212 // NSAA = Next Stack Argument Address
213 u32 ngrn = 0, nsrn = 0, nsaa = 0;
214 u32 ret_ngrn = 0, ret_nsrn = 0;
215
216public:
217 CCAssignerAAPCS() : CCAssigner(Info) {}
218
219 void reset() override { ngrn = nsrn = nsaa = ret_ngrn = ret_nsrn = 0; }
220
221 void assign_arg(CCAssignment &arg) override {
222 if (arg.byval) [[unlikely]] {
223 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
224 arg.stack_off = nsaa;
225 nsaa += arg.size;
226 return;
227 }
228
229 if (arg.sret) [[unlikely]] {
230 arg.reg = AsmReg{AsmReg::R8};
231 return;
232 }
233
234 if (arg.bank == RegBank{0}) {
235 if (arg.align > 8) {
236 ngrn = util::align_up(ngrn, 2);
237 }
238 if (ngrn + arg.consecutive < 8) {
239 arg.reg = Reg{AsmReg::R0 + ngrn};
240 ngrn += 1;
241 } else {
242 ngrn = 8;
243 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
244 arg.stack_off = nsaa;
245 nsaa += 8;
246 }
247 } else {
248 if (nsrn + arg.consecutive < 8) {
249 arg.reg = Reg{AsmReg::V0 + nsrn};
250 nsrn += 1;
251 } else {
252 nsrn = 8;
253 u32 size = util::align_up(arg.size, 8);
254 nsaa = util::align_up(nsaa, size);
255 arg.stack_off = nsaa;
256 nsaa += size;
257 }
258 }
259 }
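  // Illustrative walk-through (added; not from the original source): for a call
  // like f(i64 a, double b, i64 c), assign_arg yields a -> R0 (ngrn 0 -> 1),
  // b -> V0 (nsrn 0 -> 1) and c -> R1 (ngrn 1 -> 2); once ngrn/nsrn reach 8,
  // further arguments receive stack slots at the 8-byte-aligned offset nsaa.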
260
261 u32 get_stack_size() override { return nsaa; }
262
263 void assign_ret(CCAssignment &arg) override {
264 assert(!arg.byval && !arg.sret);
265 if (arg.bank == RegBank{0}) {
266 if (arg.align > 8) {
267 ret_ngrn = util::align_up(ret_ngrn, 2);
268 }
269 if (ret_ngrn + arg.consecutive < 8) {
270 arg.reg = Reg{AsmReg::R0 + ret_ngrn};
271 ret_ngrn += 1;
272 } else {
273 assert(false);
274 }
275 } else {
276 if (ret_nsrn + arg.consecutive < 8) {
277 arg.reg = Reg{AsmReg::V0 + ret_nsrn};
278 ret_nsrn += 1;
279 } else {
280 assert(false);
281 }
282 }
283 }
284};
285
286struct PlatformConfig : CompilerConfigDefault {
287 using Assembler = tpde::elf::AssemblerElfA64;
288 using AsmReg = tpde::a64::AsmReg;
289 using DefaultCCAssigner = CCAssignerAAPCS;
291
292 static constexpr RegBank GP_BANK{0};
293 static constexpr RegBank FP_BANK{1};
294 static constexpr bool FRAME_INDEXING_NEGATIVE = false;
295 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
296 static constexpr u32 NUM_BANKS = 2;
297};
298
299/// Compiler mixin for targeting AArch64.
300template <IRAdaptor Adaptor,
301 typename Derived,
302 template <typename, typename, typename> typename BaseTy =
303 CompilerBase,
304 typename Config = PlatformConfig>
305struct CompilerA64 : BaseTy<Adaptor, Derived, Config> {
306 using Base = BaseTy<Adaptor, Derived, Config>;
307
308 using IRValueRef = typename Base::IRValueRef;
309 using IRBlockRef = typename Base::IRBlockRef;
310 using IRFuncRef = typename Base::IRFuncRef;
311
312 using ScratchReg = typename Base::ScratchReg;
313 using ValuePartRef = typename Base::ValuePartRef;
314 using ValuePart = typename Base::ValuePart;
315 using GenericValuePart = typename Base::GenericValuePart;
316
317 using RegisterFile = typename Base::RegisterFile;
318
319 using CallArg = typename Base::CallArg;
320
321 using Base::derived;
322
323
324 // TODO(ts): make this dependent on the number of callee-saved regs of the
325 // current function or if there is a call in the function?
326 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
327 6};
328
329 // Maximum frame size is 0xfff000, therefore convert overly large static
330 // allocas directly to dynamic allocations.
331 static constexpr u32 MaxStaticAllocaSize = 0x100000;
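  // Explanatory note (added): 0xfff000 == 0xfff << 12 is the largest offset a
  // single ADD/SUB immediate with LSL #12 can encode; this is why overly large
  // static allocas (threshold MaxStaticAllocaSize = 1 MiB) are converted to
  // dynamic allocations.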
332
333 enum CPU_FEATURES : u32 {
334 CPU_BASELINE = 0, // ARMV8.0
335 };
336
337 CPU_FEATURES cpu_feats = CPU_BASELINE;
338
339 // When handling function arguments, we need to prevent argument registers
340 // from being handed out as fixed registers
341 //
342 // Additionally, we prevent R0 and R1 from being fixed assignments to
343 // prevent issues with exception handling
344 u64 fixed_assignment_nonallocatable_mask =
345 create_bitmask({AsmReg::R0, AsmReg::R1});
346 u32 func_start_off = 0u, func_prologue_alloc = 0u;
347 /// Offset to the `add sp, sp, XXX` instruction that the argument handling
348 /// uses to access stack arguments if needed
349 u32 func_arg_stack_add_off = ~0u;
350 AsmReg func_arg_stack_add_reg = AsmReg::make_invalid();
351
352 /// Permanent scratch register, e.g. to materialize constants/offsets. This is
353 /// used by materialize_constant, load_from_stack, spill_reg.
354 AsmReg permanent_scratch_reg = AsmReg::R16;
355
356 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
357 u32 reg_save_frame_off = 0;
358 util::SmallVector<u32, 8> func_ret_offs = {};
359
360 /// Helper class for building call sequences.
361 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
362 u32 stack_adjust_off = 0;
363 u32 stack_size = 0;
364 u32 stack_sub = 0;
365
366 void set_stack_used();
367
368 public:
369 /// Constructor.
370 CallBuilder(Derived &compiler, CCAssigner &assigner)
371 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
372
373 void add_arg_byval(ValuePart &vp, CCAssignment &cca);
374 void add_arg_stack(ValuePart &vp, CCAssignment &cca);
375 void call_impl(std::variant<SymRef, ValuePart> &&);
376 void reset_stack();
377 };
378
379 // for now, always generate an object
380 explicit CompilerA64(Adaptor *adaptor,
381 const CPU_FEATURES cpu_features = CPU_BASELINE)
382 : Base{adaptor}, cpu_feats(cpu_features) {
383 static_assert(std::is_base_of_v<CompilerA64, Derived>);
384 }
385
386 void start_func(u32) {}
387
388 /// Begin prologue, prepare for assigning arguments.
389 void prologue_begin(CCAssigner *cc_assigner);
390 /// Assign argument part. Returns the stack offset if the value should be
391 /// initialized as a stack variable.
392 std::optional<i32> prologue_assign_arg_part(ValuePart &&vp, CCAssignment cca);
393 /// Finish prologue.
394 void prologue_end(CCAssigner *cc_assigner);
395
396 // note: this has to call assembler->end_func
397 void finish_func(u32 func_idx);
398
399 // helpers
400
401 void gen_func_epilog();
402
403 void spill_reg(const AsmReg reg, const u32 frame_off, const u32 size);
404
405 void load_from_stack(AsmReg dst,
406 i32 frame_off,
407 u32 size,
408 bool sign_extend = false);
409
410 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap);
411
412 void mov(AsmReg dst, AsmReg src, u32 size);
413
414 GenericValuePart val_spill_slot(AssignmentPartRef ap) {
415 assert(ap.stack_valid() && !ap.variable_ref());
416 return typename GenericValuePart::Expr(AsmReg::R29, ap.frame_off());
417 }
418
419 AsmReg gval_expr_as_reg(GenericValuePart &gv);
420
421 /// Dynamic alloca of a fixed-size region.
422 void alloca_fixed(u64 size, u32 align, ValuePart &res);
423
424 /// Dynamic alloca of a dynamically-sized region (elem_size * count bytes).
425 /// count must have a size of 64 bits.
426 void alloca_dynamic(u64 elem_size,
427 ValuePart &&count,
428 u32 align,
429 ValuePart &res);
430
431 /// Materialize constant into a register.
432 void
433 materialize_constant(const u64 *data, RegBank bank, u32 size, AsmReg dst);
434 /// Materialize constant into a register.
435 void materialize_constant(u64 const_u64, RegBank bank, u32 size, AsmReg dst) {
436 assert(size <= sizeof(const_u64));
437 materialize_constant(&const_u64, bank, size, dst);
438 }
439
440 AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef);
441
442 /// Jump conditions.
443 struct Jump {
444 // TODO: naming consistency
445 enum Kind : uint8_t {
446 Jeq, ///< Equal (Z == 1)
447 Jne, ///< Not equal (Z == 0)
448 Jcs, ///< Carry set (C == 1)
449 Jhs = Jcs, ///< Unsigned higher or same (C == 1)
450 Jcc, ///< Carry clear (C == 0)
451 Jlo = Jcc, ///< Unsigned lower (C == 0)
452 Jmi, ///< Minus, negative (N == 1)
453 Jpl, ///< Plus, positive or zero (N == 0)
454 Jvs, ///< Overflow (V == 1)
455 Jvc, ///< No Overflow (V == 0)
456 Jhi, ///< Unsigned higher (C == 1 && Z == 0)
457 Jls, ///< Unsigned lower or same (!(C == 1 && Z == 0))
458 Jge, ///< Signed greater than or equal (N == V)
459 Jlt, ///< Signed less than (N != V)
460 Jgt, ///< Signed greater than (Z == 0 && N == V)
461 Jle, ///< Signed less than or equal (!(Z == 0 && N == V))
462 jmp, ///< Unconditional jump
463 Cbz, ///< Compare and branch if zero (Wn or Xn register)
464 Cbnz, ///< Compare and branch if not zero (Wn or Xn register)
465 Tbz, ///< Test single bit and branch if zero (Xn register)
466 Tbnz, ///< Test single bit and branch if not zero (Xn register)
467 };
468
469 Kind kind;
470 AsmReg cmp_reg;
471 bool cmp_is_32;
472 u8 test_bit;
473
474 /// Unconditional branch.
475 constexpr Jump() : kind(Kind::jmp) {}
476
477 /// Unconditional or conditional branch based on flags.
478 constexpr Jump(Kind kind) : kind(kind), cmp_is_32(false), test_bit(0) {
479 assert(kind != Cbz && kind != Cbnz && kind != Tbz && kind != Tbnz);
480 }
481
482 /// Cbz/Cbnz branch.
483 constexpr Jump(Kind kind, AsmReg cmp_reg, bool cmp_is_32)
484 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(cmp_is_32), test_bit(0) {
485 assert(kind == Cbz || kind == Cbnz);
486 }
487
488 /// Tbz/Tbnz branch.
489 constexpr Jump(Kind kind, AsmReg cmp_reg, u8 test_bit)
490 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(false), test_bit(test_bit) {
491 assert(kind == Tbz || kind == Tbnz);
492 }
493
494 constexpr Jump change_kind(Kind new_kind) const {
495 auto cpy = *this;
496 cpy.kind = new_kind;
497 return cpy;
498 }
499 };
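  // Construction examples (illustrative only, mirroring the asserts above):
  //   Jump{};                                      // unconditional branch
  //   Jump{Jump::Jlt};                             // conditional branch on NZCV flags
  //   Jump{Jump::Cbnz, reg, /*cmp_is_32=*/true};   // compare-and-branch on a Wn register
  //   Jump{Jump::Tbz, reg, /*test_bit=*/u8{7}};    // test bit 7, branch if it is zero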
500
501 Jump invert_jump(Jump jmp);
502 Jump swap_jump(Jump jmp);
503
504 /// Generate jump instruction to target label.
505 void generate_raw_jump(Jump jmp, Label target);
506
507 /// Convert jump condition to disarm64's Da64Cond.
508 /// \warning Cbz, Cbnz, Tbz and Tbnz are not supported
509 Da64Cond jump_to_cond(Jump jmp);
510 /// Set dst to 1 if cc is true, otherwise set it to zero
511 void generate_raw_set(Jump cc, AsmReg dst);
512 /// Set all bits of dst to 1 if cc is true, otherwise set dst to zero
513 void generate_raw_mask(Jump cc, AsmReg dst);
514
515 /// Moves true_select into dst if cc is true,
516 /// otherwise moves false_select into dst
517 void generate_raw_select(
518 Jump cc, AsmReg dst, AsmReg true_select, AsmReg false_select, bool is_64);
519
520 /// Integer extension. src is not modified.
521 void generate_raw_intext(AsmReg dst, AsmReg src, bool sign, u32 from, u32 to);
522
523 /// Bitfield insert. src is not modified.
524 void generate_raw_bfi(AsmReg dst, AsmReg src, u32 lsb, u32 width) {
525 ASM(BFIx, dst, src, lsb, width);
526 }
527 /// Bitfield insert in zero. src is not modified.
528 void generate_raw_bfiz(AsmReg dst, AsmReg src, u32 lsb, u32 width) {
529 ASM(UBFIZx, dst, src, lsb, width);
530 }
531
532 /// Generate a function call
533 ///
534 /// This will get the arguments into the correct registers according to the
535 /// calling convention, clear non-callee-saved registers from the register
536 /// file (make sure you do not have any fixed assignments left over) and
537 /// fill the result registers (the u8 in the ScratchReg pair indicates the
538 /// register bank)
539 ///
540 /// Targets can be a symbol (call to PLT with relocation), or an indirect
541 /// call to a ValuePart. Result is an optional reference.
542 void generate_call(std::variant<SymRef, ValuePart> &&target,
543 std::span<CallArg> arguments,
544 typename Base::ValueRef *result,
545 bool variable_args = false);
546
547private:
548 /// @internal Emit compare of cmp_reg with case_value.
549 void switch_emit_cmp(AsmReg cmp_reg,
550 AsmReg tmp_reg,
551 u64 case_value,
552 bool width_is_32);
553
554public:
555 /// @internal Jump if cmp_reg equals case_value.
556 void switch_emit_cmpeq(Label case_label,
557 AsmReg cmp_reg,
558 AsmReg tmp_reg,
559 u64 case_value,
560 bool width_is_32);
561 /// @internal Emit bounds check and create jump table.
562 FunctionWriterBase::JumpTable *switch_create_jump_table(Label default_label,
563 AsmReg cmp_reg,
564 AsmReg tmp_reg,
565 u64 low_bound,
566 u64 high_bound,
567 bool width_is_32);
568 /// @internal Jump if cmp_reg is greater than case_value.
569 void switch_emit_binary_step(Label case_label,
570 Label gt_label,
571 AsmReg cmp_reg,
572 AsmReg tmp_reg,
573 u64 case_value,
574 bool width_is_32);
575
576 /// Generate code sequence to load address of sym into a register. This will
577 /// generate a function call for dynamic TLS access models.
578 ScratchReg tls_get_addr(SymRef sym, TLSModel model);
579
580 bool has_cpu_feats(CPU_FEATURES feats) const {
581 return ((cpu_feats & feats) == feats);
582 }
583};
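// Usage sketch (illustrative; MyAdaptor is a hypothetical IR adaptor): the mixin
// is meant to be used via CRTP, with the derived compiler passing itself as the
// Derived parameter, as the static_assert in the constructor requires:
//
//   struct MyCompiler : tpde::a64::CompilerA64<MyAdaptor, MyCompiler> {
//     explicit MyCompiler(MyAdaptor *adaptor) : CompilerA64(adaptor) {}
//     // IR-specific operand and instruction selection hooks go here.
//   };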
584
585template <IRAdaptor Adaptor,
586 typename Derived,
587 template <typename, typename, typename> class BaseTy,
588 typename Config>
589void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
590 set_stack_used() {
591 if (stack_adjust_off == 0) {
592 this->compiler.text_writer.ensure_space(16);
593 stack_adjust_off = this->compiler.text_writer.offset();
594 this->compiler.text_writer.cur_ptr() += 4;
595 }
596}
597
598template <IRAdaptor Adaptor,
599 typename Derived,
600 template <typename, typename, typename> class BaseTy,
601 typename Config>
602void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
603 ValuePart &vp, CCAssignment &cca) {
604 AsmReg ptr_reg = vp.load_to_reg(&this->compiler);
605 AsmReg tmp_reg = AsmReg::R16;
606
607 auto size = cca.size;
608 set_stack_used();
609 for (u32 off = 0; off < size;) {
610 if (size - off >= 8) {
611 ASMC(&this->compiler, LDRxu, tmp_reg, ptr_reg, off);
612 ASMC(&this->compiler, STRxu, tmp_reg, DA_SP, cca.stack_off + off);
613 off += 8;
614 } else if (size - off >= 4) {
615 ASMC(&this->compiler, LDRwu, tmp_reg, ptr_reg, off);
616 ASMC(&this->compiler, STRwu, tmp_reg, DA_SP, cca.stack_off + off);
617 off += 4;
618 } else if (size - off >= 2) {
619 ASMC(&this->compiler, LDRHu, tmp_reg, ptr_reg, off);
620 ASMC(&this->compiler, STRHu, tmp_reg, DA_SP, cca.stack_off + off);
621 off += 2;
622 } else {
623 ASMC(&this->compiler, LDRBu, tmp_reg, ptr_reg, off);
624 ASMC(&this->compiler, STRBu, tmp_reg, DA_SP, cca.stack_off + off);
625 off += 1;
626 }
627 }
628}
629
630template <IRAdaptor Adaptor,
631 typename Derived,
632 template <typename, typename, typename> class BaseTy,
633 typename Config>
634void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
635 ValuePart &vp, CCAssignment &cca) {
636 set_stack_used();
637
638 auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
639 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
640 switch (cca.size) {
641 case 1: ASMC(&this->compiler, STRBu, reg, DA_SP, cca.stack_off); break;
642 case 2: ASMC(&this->compiler, STRHu, reg, DA_SP, cca.stack_off); break;
643 case 4: ASMC(&this->compiler, STRwu, reg, DA_SP, cca.stack_off); break;
644 case 8: ASMC(&this->compiler, STRxu, reg, DA_SP, cca.stack_off); break;
645 default: TPDE_UNREACHABLE("invalid GP reg size");
646 }
647 } else {
648 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
649 switch (cca.size) {
650 case 1: ASMC(&this->compiler, STRbu, reg, DA_SP, cca.stack_off); break;
651 case 2: ASMC(&this->compiler, STRhu, reg, DA_SP, cca.stack_off); break;
652 case 4: ASMC(&this->compiler, STRsu, reg, DA_SP, cca.stack_off); break;
653 case 8: ASMC(&this->compiler, STRdu, reg, DA_SP, cca.stack_off); break;
654 case 16: ASMC(&this->compiler, STRqu, reg, DA_SP, cca.stack_off); break;
655 default: TPDE_UNREACHABLE("invalid FP reg size");
656 }
657 }
658}
659
660template <IRAdaptor Adaptor,
661 typename Derived,
662 template <typename, typename, typename> class BaseTy,
663 typename Config>
664void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
665 std::variant<SymRef, ValuePart> &&target) {
666 u32 sub = 0;
667 if (stack_adjust_off != 0) {
668 auto *text_data = this->compiler.text_writer.begin_ptr();
669 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + stack_adjust_off);
670 u32 stack_size = this->assigner.get_stack_size();
671 sub = util::align_up(stack_size, stack_size < 0x1000 ? 0x10 : 0x1000);
672 *write_ptr = de64_SUBxi(DA_SP, DA_SP, sub);
673 } else {
674 assert(this->assigner.get_stack_size() == 0);
675 }
676
677 // For vector registers, only the lowest half is callee-saved. Evict all
678 // value parts larger than 8 bytes.
679 auto fp_regs = RegisterFile::bank_regs(Config::FP_BANK);
680 auto fp_csrs = fp_regs & this->assigner.get_ccinfo().callee_saved_regs;
681 auto used_fp_csrs = fp_csrs & this->compiler.register_file.used;
682 for (auto reg_id : util::BitSetIterator<>{used_fp_csrs}) {
683 Reg reg{reg_id};
684 ValLocalIdx local_idx = this->compiler.register_file.reg_local_idx(reg);
685 auto part = this->compiler.register_file.reg_part(reg);
686 AssignmentPartRef ap{this->compiler.val_assignment(local_idx), part};
687 if (ap.part_size() > 8) {
688 this->compiler.evict(ap);
689 }
690 }
691
692 if (auto *sym = std::get_if<SymRef>(&target)) {
693 ASMC(&this->compiler, BL, 0);
694 this->compiler.reloc_text(
695 *sym, elf::R_AARCH64_CALL26, this->compiler.text_writer.offset() - 4);
696 } else {
697 ValuePart &tvp = std::get<ValuePart>(target);
698 if (tvp.can_salvage()) {
699 ASMC(&this->compiler, BLR, tvp.salvage(&this->compiler));
700 } else {
701 AsmReg reg = this->compiler.permanent_scratch_reg;
702 tvp.reload_into_specific_fixed(&this->compiler, reg);
703 ASMC(&this->compiler, BLR, reg);
704 }
705 tvp.reset(&this->compiler);
706 }
707
708 if (stack_adjust_off != 0) {
709 ASMC(&this->compiler, ADDxi, DA_SP, DA_SP, sub);
710 }
711}
712
713template <IRAdaptor Adaptor,
714 typename Derived,
715 template <typename, typename, typename> typename BaseTy,
716 typename Config>
717void CompilerA64<Adaptor, Derived, BaseTy, Config>::prologue_begin(
718 CCAssigner *cc_assigner) {
719 func_ret_offs.clear();
720 func_start_off = this->text_writer.offset();
721
722 const CCInfo &cc_info = cc_assigner->get_ccinfo();
723
724 // We don't actually generate the prologue here and merely allocate space
725 // for it. Right now, we don't know which callee-saved registers will be
726 // used. While we could pad with nops, we later move the beginning of the
727 // function so that small functions don't have to execute 9 nops.
728 // See finish_func.
729 this->stack.frame_size = 16; // FP, LR
730 {
731 auto csr = cc_info.callee_saved_regs;
732 auto csr_gp = csr & this->register_file.bank_regs(Config::GP_BANK);
733 auto csr_fp = csr & this->register_file.bank_regs(Config::FP_BANK);
734 u32 gp_saves = std::popcount(csr_gp);
735 u32 fp_saves = std::popcount(csr_fp);
736 // LDP/STP can handle two registers of the same bank.
737 u32 reg_save_size = 4 * ((gp_saves + 1) / 2 + (fp_saves + 1) / 2);
738 // TODO: support CSR of Qx/Vx registers, not just Dx
739 this->stack.frame_size += util::align_up(gp_saves * 8 + fp_saves * 8, 16);
740
741 // Reserve space for sub sp, stp x29/x30, and mov x29, sp.
742 func_prologue_alloc = reg_save_size + 12;
743 this->text_writer.ensure_space(func_prologue_alloc);
744 this->text_writer.cur_ptr() += func_prologue_alloc;
745 }
746
747 // TODO(ts): support larger stack alignments?
748
749 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
750 this->stack.frame_used = true;
751 reg_save_frame_off = this->stack.frame_size;
752 // We additionally store a pointer to the stack area, which we can't compute
753 // with a constant offset from the frame pointer. Add 16 bytes to maintain
754 // alignment.
755 this->stack.frame_size += 8 * 8 + 8 * 16 + 16;
756 this->text_writer.ensure_space(4 * 8);
757 ASMNC(STPx, DA_GP(0), DA_GP(1), DA_SP, reg_save_frame_off);
758 ASMNC(STPx, DA_GP(2), DA_GP(3), DA_SP, reg_save_frame_off + 16);
759 ASMNC(STPx, DA_GP(4), DA_GP(5), DA_SP, reg_save_frame_off + 32);
760 ASMNC(STPx, DA_GP(6), DA_GP(7), DA_SP, reg_save_frame_off + 48);
761 ASMNC(STPq, DA_V(0), DA_V(1), DA_SP, reg_save_frame_off + 64);
762 ASMNC(STPq, DA_V(2), DA_V(3), DA_SP, reg_save_frame_off + 96);
763 ASMNC(STPq, DA_V(4), DA_V(5), DA_SP, reg_save_frame_off + 128);
764 ASMNC(STPq, DA_V(6), DA_V(7), DA_SP, reg_save_frame_off + 160);
765 }
766
767 this->func_arg_stack_add_off = ~0u;
768}
769
770template <IRAdaptor Adaptor,
771 typename Derived,
772 template <typename, typename, typename> typename BaseTy,
773 typename Config>
774std::optional<i32>
775 CompilerA64<Adaptor, Derived, BaseTy, Config>::prologue_assign_arg_part(
776 ValuePart &&vp, CCAssignment cca) {
777 if (cca.reg.valid()) [[likely]] {
778 vp.set_value_reg(this, cca.reg);
779 // Mark register as allocatable as soon as it is assigned. If the argument
780 // is unused, the register will be freed immediately and can be used for
781 // later stack arguments.
782 this->register_file.allocatable |= u64{1} << cca.reg.id();
783 return {};
784 }
785
786 AsmReg dst = vp.alloc_reg(this);
787
788 this->text_writer.ensure_space(8);
789 AsmReg stack_reg = AsmReg::R17;
790 // TODO: allocate an actual scratch register for this.
791 assert(!(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
792 "x17 must not be allocatable");
793 if (this->func_arg_stack_add_off == ~0u) {
794 this->func_arg_stack_add_off = this->text_writer.offset();
795 this->func_arg_stack_add_reg = stack_reg;
796 // Fixed in finish_func when frame size is known
797 ASMNC(ADDxi, stack_reg, DA_SP, 0);
798 }
799
800 if (cca.byval) {
801 ASMNC(ADDxi, dst, stack_reg, cca.stack_off);
802 } else if (cca.bank == Config::GP_BANK) {
803 switch (cca.size) {
804 case 1: ASMNC(LDRBu, dst, stack_reg, cca.stack_off); break;
805 case 2: ASMNC(LDRHu, dst, stack_reg, cca.stack_off); break;
806 case 4: ASMNC(LDRwu, dst, stack_reg, cca.stack_off); break;
807 case 8: ASMNC(LDRxu, dst, stack_reg, cca.stack_off); break;
808 default: TPDE_UNREACHABLE("invalid GP reg size");
809 }
810 } else {
811 assert(cca.bank == Config::FP_BANK);
812 switch (cca.size) {
813 case 1: ASMNC(LDRbu, dst, stack_reg, cca.stack_off); break;
814 case 2: ASMNC(LDRhu, dst, stack_reg, cca.stack_off); break;
815 case 4: ASMNC(LDRsu, dst, stack_reg, cca.stack_off); break;
816 case 8: ASMNC(LDRdu, dst, stack_reg, cca.stack_off); break;
817 case 16: ASMNC(LDRqu, dst, stack_reg, cca.stack_off); break;
818 default: TPDE_UNREACHABLE("invalid FP reg size");
819 }
820 }
821 return {};
822}
823
824template <IRAdaptor Adaptor,
825 typename Derived,
826 template <typename, typename, typename> typename BaseTy,
827 typename Config>
828void CompilerA64<Adaptor, Derived, BaseTy, Config>::prologue_end(
829 CCAssigner *cc_assigner) {
830 // Hack: we don't know the frame size, so for a va_start(), we cannot easily
831 // compute the offset from the frame pointer. But we have a stack_reg here,
832 // so use it for var args.
833 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
834 this->stack.frame_used = true;
835 AsmReg stack_reg = AsmReg::R17;
836 // TODO: allocate an actual scratch register for this.
837 assert(!(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
838 "x17 must not be allocatable");
839 if (this->func_arg_stack_add_off == ~0u) {
840 this->func_arg_stack_add_off = this->text_writer.offset();
841 this->func_arg_stack_add_reg = stack_reg;
842 // Fixed in finish_func when frame size is known
843 ASMC(this, ADDxi, stack_reg, DA_SP, 0);
844 }
845 ASM(ADDxi, stack_reg, stack_reg, cc_assigner->get_stack_size());
846 ASM(STRxu, stack_reg, DA_GP(29), this->reg_save_frame_off + 192);
847
848 // TODO: extract ngrn/nsrn from CCAssigner
849 // TODO: this isn't quite accurate, e.g. for (i128, i128, i128, i64, i128),
850 // this should be 8 but will end up with 7.
851 const CCInfo &cc_info = cc_assigner->get_ccinfo();
852 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
853 u32 ngrn = 8 - util::cnt_lz<u16>((arg_regs & 0xff) << 8 | 0x80);
854 u32 nsrn = 8 - util::cnt_lz<u16>(((arg_regs >> 32) & 0xff) << 8 | 0x80);
855 this->scalar_arg_count = ngrn;
856 this->vec_arg_count = nsrn;
857 }
858}
859
860template <IRAdaptor Adaptor,
861 typename Derived,
862 template <typename, typename, typename> typename BaseTy,
863 typename Config>
864void CompilerA64<Adaptor, Derived, BaseTy, Config>::finish_func(u32 func_idx) {
865 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
866 u64 saved_regs = this->register_file.clobbered & csr;
867
868 auto stack_reg = DA_SP;
869 if (this->stack.has_dynamic_alloca) {
870 stack_reg = DA_GP(29);
871 }
872
873 auto final_frame_size = util::align_up(this->stack.frame_size, 16);
874 if (final_frame_size > 4095) {
875 // round up to 4k since SUB cannot encode immediates greater than 4095
876 final_frame_size = util::align_up(final_frame_size, 4096);
877 assert(final_frame_size < 16 * 1024 * 1024);
878 }
879
880 bool needs_stack_frame =
881 this->stack.frame_used || this->stack.generated_call ||
882 this->stack.has_dynamic_alloca || saved_regs != 0 ||
883 (this->register_file.clobbered & (u64{1} << AsmReg::LR));
884
885 this->text_writer.eh_begin_fde(this->get_personality_sym());
886
887 u32 prologue_size = 0;
888 if (needs_stack_frame) [[likely]] {
889 // NB: code alignment factor 4, data alignment factor -8.
890 util::SmallVector<u32, 16> prologue;
891 // For small stack frames, remember the state at the very beginning, which
892 // is identical to the state after the post-increment LDP. For large stack
893 // frames, remember the state after the SP adjustment (encoding the
894 // corresponding DW_def_cfa SP, framesize would be >=3 bytes; this way we
895 // can get away with a DW_def_cfa_offset 0 after the ADD).
896 if (!func_ret_offs.empty() && final_frame_size <= 0x1f8) {
897 this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
898 }
899 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
900 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset,
901 final_frame_size);
902 if (final_frame_size <= 0x1f8) {
903 prologue.push_back(
904 de64_STPx_pre(DA_GP(29), DA_GP(30), DA_SP, -int(final_frame_size)));
905 prologue.push_back(de64_MOV_SPx(DA_GP(29), DA_SP));
906 } else {
907 if (!func_ret_offs.empty()) {
908 this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
909 }
910 prologue.push_back(de64_SUBxi(DA_SP, DA_SP, final_frame_size));
911 prologue.push_back(de64_STPx(DA_GP(29), DA_GP(30), DA_SP, 0));
912 prologue.push_back(de64_MOV_SPx(DA_GP(29), DA_SP));
913 }
914
915 // Patched below
916 auto fde_prologue_adv_off = this->text_writer.eh_writer.size();
917 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
918 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
919 dwarf::a64::DW_reg_fp);
920 this->text_writer.eh_write_inst(
921 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_fp, final_frame_size / 8);
922 this->text_writer.eh_write_inst(
923 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_lr, final_frame_size / 8 - 1);
924
925 AsmReg last_reg = AsmReg::make_invalid();
926 u32 frame_off = 16;
927 for (auto reg : util::BitSetIterator{saved_regs}) {
928 u8 dwarf_base = reg < 32 ? dwarf::a64::DW_reg_x0 : dwarf::a64::DW_reg_v0;
929 u8 dwarf_reg = dwarf_base + reg % 32;
930 u32 cfa_off = (final_frame_size - frame_off) / 8 - last_reg.valid();
931 if ((dwarf_reg & dwarf::DWARF_CFI_PRIMARY_OPCODE_MASK) == 0) {
932 this->text_writer.eh_write_inst(
933 dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
934 } else {
935 this->text_writer.eh_write_inst(
936 dwarf::DW_CFA_offset_extended, dwarf_reg, cfa_off);
937 }
938
939 if (last_reg.valid()) {
940 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
941 const auto last_bank = this->register_file.reg_bank(last_reg);
942 if (reg_bank == last_bank) {
943 if (reg_bank == Config::GP_BANK) {
944 prologue.push_back(
945 de64_STPx(last_reg, AsmReg{reg}, stack_reg, frame_off));
946 } else {
947 prologue.push_back(
948 de64_STPd(last_reg, AsmReg{reg}, stack_reg, frame_off));
949 }
950 frame_off += 16;
951 last_reg = AsmReg::make_invalid();
952 } else {
953 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
954 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
955 frame_off += 8;
956 last_reg = AsmReg{reg};
957 }
958 } else {
959 last_reg = AsmReg{reg};
960 }
961 }
962
963 if (last_reg.valid()) {
964 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
965 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
966 } else {
967 assert(this->register_file.reg_bank(last_reg) == Config::FP_BANK);
968 prologue.push_back(de64_STRdu(last_reg, stack_reg, frame_off));
969 }
970 }
971
972 assert(prologue.size() * sizeof(u32) <= func_prologue_alloc);
973
974 assert(prologue.size() < 0x4c);
975 this->text_writer.eh_writer.data()[fde_prologue_adv_off] =
976 dwarf::DW_CFA_advance_loc | (prologue.size() - 1);
977
978 std::memcpy(this->text_writer.begin_ptr() + func_start_off,
979 prologue.data(),
980 prologue.size() * sizeof(u32));
981
982 prologue_size = prologue.size() * sizeof(u32);
983 }
984
985 if (func_arg_stack_add_off != ~0u) {
986 auto *raw_inst_ptr = this->text_writer.begin_ptr() + func_arg_stack_add_off;
987 u32 *inst_ptr = reinterpret_cast<u32 *>(raw_inst_ptr);
988 if (needs_stack_frame) {
989 *inst_ptr = de64_ADDxi(func_arg_stack_add_reg, DA_SP, final_frame_size);
990 } else {
991 *inst_ptr = de64_MOV_SPx(func_arg_stack_add_reg, DA_SP);
992 }
993 }
994
995 if (!func_ret_offs.empty()) {
996 u8 *text_data = this->text_writer.begin_ptr();
997 if (func_ret_offs.back() == this->text_writer.offset() - 4) {
998 this->text_writer.cur_ptr() -= 4;
999 func_ret_offs.pop_back();
1000 }
1001 for (auto ret_off : func_ret_offs) {
1002 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + ret_off);
1003 *write_ptr = de64_B((this->text_writer.offset() - ret_off) / 4);
1004 }
1005
1006 // Epilogue mirrors prologue + RET
1007 this->text_writer.ensure_space(prologue_size + 4);
1008
1009 if (this->stack.has_dynamic_alloca) {
1010 ASMNC(MOV_SPx, DA_SP, DA_GP(29));
1011 }
1012
1013 AsmReg last_reg = AsmReg::make_invalid();
1014 u32 frame_off = 16;
1015 for (auto reg : util::BitSetIterator{saved_regs}) {
1016 if (last_reg.valid()) {
1017 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
1018 const auto last_bank = this->register_file.reg_bank(last_reg);
1019 if (reg_bank == last_bank) {
1020 if (reg_bank == Config::GP_BANK) {
1021 ASMNC(LDPx, last_reg, AsmReg{reg}, stack_reg, frame_off);
1022 } else {
1023 ASMNC(LDPd, last_reg, AsmReg{reg}, stack_reg, frame_off);
1024 }
1025 frame_off += 16;
1026 last_reg = AsmReg::make_invalid();
1027 } else {
1028 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
1029 ASMNC(LDRxu, last_reg, stack_reg, frame_off);
1030 frame_off += 8;
1031 last_reg = AsmReg{reg};
1032 }
1033 continue;
1034 }
1035
1036 last_reg = AsmReg{reg};
1037 }
1038
1039 if (last_reg.valid()) {
1040 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
1041 ASMNC(LDRxu, last_reg, stack_reg, frame_off);
1042 } else {
1043 ASMNC(LDRdu, last_reg, stack_reg, frame_off);
1044 }
1045 }
1046 if (needs_stack_frame) {
1047 u32 body_start = func_start_off + func_prologue_alloc;
1048 this->text_writer.eh_advance(this->text_writer.offset() - body_start + 4);
1049 this->text_writer.eh_write_inst(dwarf::DW_CFA_restore_state);
1050 if (final_frame_size <= 0x1f8) {
1051 ASMNC(LDPx_post, DA_GP(29), DA_GP(30), DA_SP, final_frame_size);
1052 // CFI is correct here.
1053 } else {
1054 ASMNC(LDPx, DA_GP(29), DA_GP(30), DA_SP, 0);
1055 // CFI is correct here, but we need to update the CFA after the ADD.
1056 ASMNC(ADDxi, DA_SP, DA_SP, final_frame_size);
1057 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
1058 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 0);
1059 }
1060 }
1061
1062 ASMNC(RET, DA_GP(30));
1063 }
1064
1065 // TODO(ts): honor cur_needs_unwind_info
1066 this->text_writer.remove_prologue_bytes(func_start_off + prologue_size,
1067 func_prologue_alloc - prologue_size);
1068 auto func_size = this->text_writer.offset() - func_start_off;
1069 auto func_sym = this->func_syms[func_idx];
1070 auto func_sec = this->text_writer.get_sec_ref();
1071 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
1072 this->text_writer.eh_end_fde();
1073 this->text_writer.except_encode_func();
1074}
1075
1076template <IRAdaptor Adaptor,
1077 typename Derived,
1078 template <typename, typename, typename> typename BaseTy,
1079 typename Config>
1080void CompilerA64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() {
1081 // Patched at the end, just reserve the space here.
1082 func_ret_offs.push_back(this->text_writer.offset());
1083 this->text_writer.ensure_space(4); // Single branch to actual epilogue.
1084 this->text_writer.cur_ptr() += 4;
1085}
1086
1087template <IRAdaptor Adaptor,
1088 typename Derived,
1089 template <typename, typename, typename> typename BaseTy,
1090 typename Config>
1091void CompilerA64<Adaptor, Derived, BaseTy, Config>::spill_reg(
1092 const AsmReg reg, const u32 frame_off, const u32 size) {
1093 assert(this->stack.frame_used);
1094 assert((size & (size - 1)) == 0);
1095 assert(util::align_up(frame_off, size) == frame_off);
1096 // We don't support stack frames that aren't encodeable with add/sub.
1097 assert(frame_off < 0x1'000'000);
1098 this->text_writer.ensure_space(8);
1099
1100 u32 off = frame_off;
1101 auto addr_base = AsmReg{AsmReg::FP};
1102 if (off >= 0x1000 * size) [[unlikely]] {
1103 // We cannot encode the offset in the store instruction.
1104 ASMNC(ADDxi, permanent_scratch_reg, DA_GP(29), off & ~0xfff);
1105 off &= 0xfff;
1106 addr_base = permanent_scratch_reg;
1107 }
1108
1109 assert(-static_cast<i32>(frame_off) < 0);
1110 if (reg.id() <= AsmReg::R30) {
1111 switch (size) {
1112 case 1: ASMNC(STRBu, reg, addr_base, off); break;
1113 case 2: ASMNC(STRHu, reg, addr_base, off); break;
1114 case 4: ASMNC(STRwu, reg, addr_base, off); break;
1115 case 8: ASMNC(STRxu, reg, addr_base, off); break;
1116 default: TPDE_UNREACHABLE("invalid register spill size");
1117 }
1118 } else {
1119 switch (size) {
1120 case 1: ASMNC(STRbu, reg, addr_base, off); break;
1121 case 2: ASMNC(STRhu, reg, addr_base, off); break;
1122 case 4: ASMNC(STRsu, reg, addr_base, off); break;
1123 case 8: ASMNC(STRdu, reg, addr_base, off); break;
1124 case 16: ASMNC(STRqu, reg, addr_base, off); break;
1125 default: TPDE_UNREACHABLE("invalid register spill size");
1126 }
1127 }
1128}
1129
1130template <IRAdaptor Adaptor,
1131 typename Derived,
1132 template <typename, typename, typename> typename BaseTy,
1133 typename Config>
1134void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
1135 const AsmReg dst,
1136 const i32 frame_off,
1137 const u32 size,
1138 const bool sign_extend) {
1139 assert(this->stack.frame_used);
1140 assert((size & (size - 1)) == 0);
1141 assert(util::align_up(frame_off, size) == frame_off);
1142 // We don't support stack frames that aren't encodeable with add/sub.
1143 assert(frame_off >= 0 && frame_off < 0x1'000'000);
1144 this->text_writer.ensure_space(8);
1145
1146 u32 off = frame_off;
1147 auto addr_base = AsmReg{AsmReg::FP};
1148 if (off >= 0x1000 * size) [[unlikely]] {
1149 // need to calculate this explicitly
1150 addr_base = dst.id() <= AsmReg::R30 ? dst : permanent_scratch_reg;
1151 ASMNC(ADDxi, addr_base, DA_GP(29), off & ~0xfff);
1152 off &= 0xfff;
1153 }
1154
1155 if (dst.id() <= AsmReg::R30) {
1156 if (!sign_extend) {
1157 switch (size) {
1158 case 1: ASMNC(LDRBu, dst, addr_base, off); break;
1159 case 2: ASMNC(LDRHu, dst, addr_base, off); break;
1160 case 4: ASMNC(LDRwu, dst, addr_base, off); break;
1161 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1162 default: TPDE_UNREACHABLE("invalid register spill size");
1163 }
1164 } else {
1165 switch (size) {
1166 case 1: ASMNC(LDRSBwu, dst, addr_base, off); break;
1167 case 2: ASMNC(LDRSHwu, dst, addr_base, off); break;
1168 case 4: ASMNC(LDRSWxu, dst, addr_base, off); break;
1169 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1170 default: TPDE_UNREACHABLE("invalid register spill size");
1171 }
1172 }
1173 return;
1174 }
1175
1176 assert(!sign_extend);
1177
1178 switch (size) {
1179 case 1: ASMNC(LDRbu, dst, addr_base, off); break;
1180 case 2: ASMNC(LDRhu, dst, addr_base, off); break;
1181 case 4: ASMNC(LDRsu, dst, addr_base, off); break;
1182 case 8: ASMNC(LDRdu, dst, addr_base, off); break;
1183 case 16: ASMNC(LDRqu, dst, addr_base, off); break;
1184 default: TPDE_UNREACHABLE("invalid register spill size");
1185 }
1186}
1187
1188template <IRAdaptor Adaptor,
1189 typename Derived,
1190 template <typename, typename, typename> typename BaseTy,
1191 typename Config>
1192void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
1193 const AsmReg dst, const AssignmentPartRef ap) {
1194 assert(this->stack.frame_used);
1195 auto frame_off = ap.variable_stack_off();
1196 assert(frame_off >= 0);
1197 if (!ASMIF(ADDxi, dst, DA_GP(29), frame_off)) {
1198 materialize_constant(frame_off, Config::GP_BANK, 4, dst);
1199 ASM(ADDx_uxtw, dst, DA_GP(29), dst, 0);
1200 }
1201}
1202
1203template <IRAdaptor Adaptor,
1204 typename Derived,
1205 template <typename, typename, typename> typename BaseTy,
1206 typename Config>
1207void CompilerA64<Adaptor, Derived, BaseTy, Config>::mov(const AsmReg dst,
1208 const AsmReg src,
1209 const u32 size) {
1210 this->text_writer.ensure_space(4);
1211 assert(dst.valid());
1212 assert(src.valid());
1213 if (dst.id() <= AsmReg::SP && src.id() <= AsmReg::SP) {
1214 assert(dst.id() != AsmReg::SP && src.id() != AsmReg::SP);
1215 if (size > 4) {
1216 ASMNC(MOVx, dst, src);
1217 } else {
1218 ASMNC(MOVw, dst, src);
1219 }
1220 } else if (dst.id() >= AsmReg::V0 && src.id() >= AsmReg::V0) {
1221 ASMNC(ORR16b, dst, src, src);
1222 } else if (dst.id() <= AsmReg::SP) {
1223 assert(dst.id() != AsmReg::SP);
1224 // gp<-vector
1225 assert(src.id() >= AsmReg::V0);
1226 assert(size <= 8);
1227 if (size <= 4) {
1228 ASMNC(FMOVws, dst, src);
1229 } else {
1230 ASMNC(FMOVxd, dst, src);
1231 }
1232 } else {
1233 // vector<-gp
1234 assert(src.id() <= AsmReg::R30);
1235 assert(dst.id() >= AsmReg::V0);
1236 assert(size <= 8);
1237 if (size <= 4) {
1238 ASMNC(FMOVsw, dst, src);
1239 } else {
1240 ASMNC(FMOVdx, dst, src);
1241 }
1242 }
1243}
1244
1245template <IRAdaptor Adaptor,
1246 typename Derived,
1247 template <typename, typename, typename> typename BaseTy,
1248 typename Config>
1249AsmReg CompilerA64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1250 GenericValuePart &gv) {
1251 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1252
1253 ScratchReg scratch{derived()};
1254 if (!expr.has_base() && !expr.has_index()) {
1255 AsmReg dst = scratch.alloc_gp();
1256 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, dst);
1257 expr.disp = 0;
1258 } else if (!expr.has_base() && expr.has_index()) {
1259 AsmReg index_reg = expr.index_reg();
1260 if (std::holds_alternative<ScratchReg>(expr.index)) {
1261 scratch = std::move(std::get<ScratchReg>(expr.index));
1262 } else {
1263 (void)scratch.alloc_gp();
1264 }
1265 AsmReg dst = scratch.cur_reg();
1266 if ((expr.scale & (expr.scale - 1)) == 0) {
1267 const auto shift = util::cnt_tz<u64>(expr.scale);
1268 ASM(LSLxi, dst, index_reg, shift);
1269 } else {
1270 AsmReg tmp2 = permanent_scratch_reg;
1271 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1272 ASM(MULx, dst, index_reg, tmp2);
1273 }
1274 } else if (expr.has_base() && expr.has_index()) {
1275 AsmReg base_reg = expr.base_reg();
1276 AsmReg index_reg = expr.index_reg();
1277 if (std::holds_alternative<ScratchReg>(expr.base)) {
1278 scratch = std::move(std::get<ScratchReg>(expr.base));
1279 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1280 scratch = std::move(std::get<ScratchReg>(expr.index));
1281 } else {
1282 (void)scratch.alloc_gp();
1283 }
1284 AsmReg dst = scratch.cur_reg();
1285 if ((expr.scale & (expr.scale - 1)) == 0) {
1286 const auto shift = util::cnt_tz<u64>(expr.scale);
1287 ASM(ADDx_lsl, dst, base_reg, index_reg, shift);
1288 } else {
1289 AsmReg tmp2 = permanent_scratch_reg;
1290 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1291 ASM(MADDx, dst, index_reg, tmp2, base_reg);
1292 }
1293 } else if (expr.has_base() && !expr.has_index()) {
1294 AsmReg base_reg = expr.base_reg();
1295 if (std::holds_alternative<ScratchReg>(expr.base)) {
1296 scratch = std::move(std::get<ScratchReg>(expr.base));
1297 } else {
1298 (void)scratch.alloc_gp();
1299 }
1300 AsmReg dst = scratch.cur_reg();
1301 if (expr.disp != 0 && ASMIF(ADDxi, dst, base_reg, expr.disp)) {
1302 expr.disp = 0;
1303 } else if (dst != base_reg) {
1304 ASM(MOVx, dst, base_reg);
1305 }
1306 } else {
1307 TPDE_UNREACHABLE("inconsistent GenericValuePart::Expr");
1308 }
1309
1310 AsmReg dst = scratch.cur_reg();
1311 if (expr.disp != 0) {
1312 if (!ASMIF(ADDxi, dst, dst, expr.disp)) {
1313 AsmReg tmp2 = permanent_scratch_reg;
1314 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, tmp2);
1315 ASM(ADDx, dst, dst, tmp2);
1316 }
1317 }
1318
1319 gv.state = std::move(scratch);
1320 return dst;
1321}
1322
1323template <IRAdaptor Adaptor,
1324 typename Derived,
1325 template <typename, typename, typename> typename BaseTy,
1326 typename Config>
1327void CompilerA64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
1328 u64 size, u32 align, ValuePart &res) {
1329 assert(this->stack.has_dynamic_alloca &&
1330 "function marked as not having dynamic allocas can't have alloca");
1331 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1332 size = tpde::util::align_up(size, 16);
1333 AsmReg res_reg = res.alloc_reg(this);
1334 if (size >= 0x10'0000) {
1335 auto tmp = permanent_scratch_reg;
1336 materialize_constant(size, Config::GP_BANK, 8, tmp);
1337 ASM(SUBx_uxtx, res_reg, DA_SP, tmp, 0);
1338 } else if (size >= 0x1000) {
1339 ASM(SUBxi, res_reg, DA_SP, size & 0xff'f000);
1340 if (size & 0xfff) {
1341 ASM(SUBxi, res_reg, res_reg, size & 0xfff);
1342 }
1343 } else {
1344 ASM(SUBxi, res_reg, DA_SP, size & 0xfff);
1345 }
1346
1347 if (align > 16) {
1348 // The stack pointer is always at least 16-byte aligned.
1349 ASM(ANDxi, res_reg, res_reg, ~(u64{align} - 1));
1350 }
1351
1352 if (size > 0) {
1353 ASM(MOV_SPx, DA_SP, res_reg);
1354 }
1355}
1356
1357template <IRAdaptor Adaptor,
1358 typename Derived,
1359 template <typename, typename, typename> typename BaseTy,
1360 typename Config>
1361void CompilerA64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
1362 u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) {
1363 assert(this->stack.has_dynamic_alloca &&
1364 "function marked as not having dynamic allocas can't have alloca");
1365 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1366 AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
1367 AsmReg res_reg = res.alloc_try_reuse(this, count);
1368
1369 if (elem_size == 0) {
1370 ASM(MOVZw, res_reg, 0);
1371 } else if ((elem_size & (elem_size - 1)) == 0) {
1372 const auto shift = util::cnt_tz(elem_size);
1373 if (shift <= 4) {
1374 ASM(SUBx_uxtx, res_reg, DA_SP, size_reg, shift);
1375 } else {
1376 ASM(LSLxi, res_reg, size_reg, shift);
1377 ASM(SUBx_uxtx, res_reg, DA_SP, res_reg, 0);
1378 }
1379 } else {
1380 auto tmp = permanent_scratch_reg;
1381 materialize_constant(elem_size, Config::GP_BANK, 8, tmp);
1382 ASM(MULx, res_reg, size_reg, tmp);
1383 ASM(SUBx_uxtx, res_reg, DA_SP, res_reg, 0);
1384 }
1385
1386 align = align > 16 ? align : 16;
1387 if (elem_size & (align - 1)) {
1388 ASM(ANDxi, res_reg, res_reg, ~(u64{align} - 1));
1389 }
1390
1391 ASM(MOV_SPx, DA_SP, res_reg);
1392}
1393
1394template <IRAdaptor Adaptor,
1395 typename Derived,
1396 template <typename, typename, typename> typename BaseTy,
1397 typename Config>
1398void CompilerA64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1399 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) {
1400 this->text_writer.ensure_space(5 * 4);
1401
1402 const auto const_u64 = data[0];
1403 if (bank == Config::GP_BANK) {
1404 assert(size <= 8);
1405 if (const_u64 == 0) {
1406 ASMNC(MOVZw, dst, 0);
1407 return;
1408 }
1409
1410 this->text_writer.cur_ptr() +=
1411 sizeof(u32) *
1412 de64_MOVconst(reinterpret_cast<u32 *>(this->text_writer.cur_ptr()),
1413 dst,
1414 const_u64);
1415 return;
1416 }
1417
1418 assert(bank == Config::FP_BANK);
1419 // Try instructions that take an immediate
1420 if (size == 4) {
1421 if (ASMIF(FMOVsi, dst, std::bit_cast<float>((u32)const_u64))) {
1422 return;
1423 } else if (ASMIF(MOVId, dst, static_cast<u32>(const_u64))) {
1424 return;
1425 }
1426 } else if (size == 8) {
1427 if (ASMIF(FMOVdi, dst, std::bit_cast<double>(const_u64))) {
1428 return;
1429 } else if (ASMIF(MOVId, dst, const_u64)) {
1430 return;
1431 }
1432 } else if (size == 16) {
1433 const auto high_u64 = data[1];
1434 if (const_u64 == high_u64 && ASMIF(MOVI2d, dst, const_u64)) {
1435 return;
1436 } else if (high_u64 == 0 && ASMIF(MOVId, dst, const_u64)) {
1437 return;
1438 }
1439 }
1440
1441 // We must either load through a GP register or from memory. Both cases need a
1442 // GP register in the common case. We reserve x16/x17 for cases like this.
1443 if (size <= 16) {
1444 this->register_file.mark_clobbered(permanent_scratch_reg);
1445 // Copy from a GP register
1446 // TODO: always load from memory?
1447 if (size <= 8) {
1448 materialize_constant(data, Config::GP_BANK, size, permanent_scratch_reg);
1449 if (size <= 4) {
1450 ASMNC(FMOVsw, dst, permanent_scratch_reg);
1451 } else {
1452 ASMNC(FMOVdx, dst, permanent_scratch_reg);
1453 }
1454 return;
1455 }
1456
1457 auto rodata = this->assembler.get_default_section(SectionKind::ReadOnly);
1458 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), size};
1459 auto sym = this->assembler.sym_def_data(
1460 rodata, "", raw_data, 16, Assembler::SymBinding::LOCAL);
1461 this->text_writer.ensure_space(8); // ensure contiguous instructions
1462 this->reloc_text(
1463 sym, elf::R_AARCH64_ADR_PREL_PG_HI21, this->text_writer.offset(), 0);
1464 ASMNC(ADRP, permanent_scratch_reg, 0, 0);
1465 this->reloc_text(
1466 sym, elf::R_AARCH64_LDST128_ABS_LO12_NC, this->text_writer.offset(), 0);
1467 ASMNC(LDRqu, dst, permanent_scratch_reg, 0);
1468 return;
1469 }
1470
1471 TPDE_FATAL("unable to materialize constant");
1472}
1473
1474template <IRAdaptor Adaptor,
1475 typename Derived,
1476 template <typename, typename, typename> typename BaseTy,
1477 typename Config>
1478AsmReg
1479 CompilerA64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1480 AssignmentPartRef ap, IRValueRef) {
1481 RegBank bank = ap.bank();
1482 if (bank == Config::FP_BANK && ap.part_size() > 8) {
1483 // FP registers cannot in general be fixed registers, as only the lowest 8
1484 // bytes are callee-saved.
1485 return AsmReg::make_invalid();
1486 }
1487
1488 // TODO(ts): why is this in here?
1489 assert(bank.id() <= Config::NUM_BANKS);
1490 auto reg_mask = this->register_file.bank_regs(bank);
1491 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1492
1493 const auto find_possible_regs = [this,
1494 reg_mask](const u64 preferred_regs) -> u64 {
1495 // try to first get an unused reg, otherwise an unfixed reg
1496 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1497 return free_regs & preferred_regs & reg_mask;
1498 };
1499
1500 u64 possible_regs;
1501 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1502 if (!this->stack.is_leaf_function) {
1503 // we can only allocate fixed assignments from the callee-saved regs
1504 possible_regs = find_possible_regs(csr);
1505 } else {
1506 // try allocating any non-callee saved register first, except the result
1507 // registers
1508 possible_regs = find_possible_regs(~csr);
1509 if (possible_regs == 0) {
1510 // otherwise fallback to callee-saved regs
1511 possible_regs = find_possible_regs(csr);
1512 }
1513 }
1514
1515 if (possible_regs == 0) {
1516 return AsmReg::make_invalid();
1517 }
1518
1519 // try to first get an unused reg, otherwise an unfixed reg
1520 if ((possible_regs & ~this->register_file.used) != 0) {
1521 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1522 }
1523
1524 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1525 const auto reg = AsmReg{reg_id};
1526
1527 assert(!this->register_file.is_fixed(reg));
1528
1529 const auto local_idx = this->register_file.reg_local_idx(reg);
1530 const auto part = this->register_file.reg_part(reg);
1531 assert(local_idx != Base::INVALID_VAL_LOCAL_IDX);
1532
1533 auto *assignment = this->val_assignment(local_idx);
1534 auto ap = AssignmentPartRef{assignment, part};
1535 if (ap.modified()) {
1536 continue;
1537 }
1538
1539 return reg;
1540 }
1541
1542 return AsmReg::make_invalid();
1543}
1544
1545template <IRAdaptor Adaptor,
1546 typename Derived,
1547 template <typename, typename, typename> class BaseTy,
1548 typename Config>
1549typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1550 CompilerA64<Adaptor, Derived, BaseTy, Config>::invert_jump(Jump jmp) {
1551 switch (jmp.kind) {
1552 case Jump::Jeq: return jmp.change_kind(Jump::Jne);
1553 case Jump::Jne: return jmp.change_kind(Jump::Jeq);
1554 case Jump::Jcs: return jmp.change_kind(Jump::Jcc);
1555 case Jump::Jcc: return jmp.change_kind(Jump::Jcs);
1556 case Jump::Jmi: return jmp.change_kind(Jump::Jpl);
1557 case Jump::Jpl: return jmp.change_kind(Jump::Jmi);
1558 case Jump::Jvs: return jmp.change_kind(Jump::Jvc);
1559 case Jump::Jvc: return jmp.change_kind(Jump::Jvs);
1560 case Jump::Jhi: return jmp.change_kind(Jump::Jls);
1561 case Jump::Jls: return jmp.change_kind(Jump::Jhi);
1562 case Jump::Jge: return jmp.change_kind(Jump::Jlt);
1563 case Jump::Jlt: return jmp.change_kind(Jump::Jge);
1564 case Jump::Jgt: return jmp.change_kind(Jump::Jle);
1565 case Jump::Jle: return jmp.change_kind(Jump::Jgt);
1566 case Jump::jmp: return jmp;
1567 case Jump::Cbz: return jmp.change_kind(Jump::Cbnz);
1568 case Jump::Cbnz: return jmp.change_kind(Jump::Cbz);
1569 case Jump::Tbz: return jmp.change_kind(Jump::Tbnz);
1570 case Jump::Tbnz: return jmp.change_kind(Jump::Tbz);
1571 default: TPDE_UNREACHABLE("invalid jump kind");
1572 }
1573}
1574
1575template <IRAdaptor Adaptor,
1576 typename Derived,
1577 template <typename, typename, typename> typename BaseTy,
1578 typename Config>
1579typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1580 CompilerA64<Adaptor, Derived, BaseTy, Config>::swap_jump(Jump jmp) {
1581 switch (jmp.kind) {
1582 case Jump::Jeq: return jmp.change_kind(Jump::Jeq);
1583 case Jump::Jne: return jmp.change_kind(Jump::Jne);
1584 case Jump::Jcc: return jmp.change_kind(Jump::Jhi);
1585 case Jump::Jcs: return jmp.change_kind(Jump::Jls);
1586 case Jump::Jhi: return jmp.change_kind(Jump::Jcc);
1587 case Jump::Jls: return jmp.change_kind(Jump::Jcs);
1588 case Jump::Jge: return jmp.change_kind(Jump::Jle);
1589 case Jump::Jlt: return jmp.change_kind(Jump::Jgt);
1590 case Jump::Jgt: return jmp.change_kind(Jump::Jlt);
1591 case Jump::Jle: return jmp.change_kind(Jump::Jge);
1592 case Jump::jmp: return jmp;
1593 case Jump::Jmi:
1594 case Jump::Jpl:
1595 case Jump::Jvs:
1596 case Jump::Jvc:
1597 case Jump::Cbz:
1598 case Jump::Cbnz:
1599 case Jump::Tbz:
1600 case Jump::Tbnz:
1601 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1602 }
1603}
1604
1605template <IRAdaptor Adaptor,
1606 typename Derived,
1607 template <typename, typename, typename> typename BaseTy,
1608 typename Config>
1609void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1610 Jump jmp, Label target_label) {
1611 const auto is_pending = this->text_writer.label_is_pending(target_label);
1612 this->text_writer.ensure_space(4);
1613 if (jmp.kind == Jump::jmp) {
1614 if (is_pending) {
1615 ASMNC(B, 0);
1616 this->text_writer.label_ref(target_label,
1617 this->text_writer.offset() - 4,
1618 LabelFixupKind::AARCH64_BR);
1619 } else {
1620 const auto label_off = this->text_writer.label_offset(target_label);
1621 const auto cur_off = this->text_writer.offset();
1622 assert(cur_off >= label_off);
1623 const auto diff = cur_off - label_off;
1624 assert((diff & 0b11) == 0);
1625 assert(diff < 128 * 1024 * 1024);
1626
1627 ASMNC(B, -static_cast<ptrdiff_t>(diff) / 4);
1628 }
1629 return;
1630 }
1631
1632 if (jmp.kind == Jump::Cbz || jmp.kind == Jump::Cbnz) {
1633 u32 off = 0;
1634 if (!is_pending) {
1635 const auto label_off = this->text_writer.label_offset(target_label);
1636 const auto cur_off = this->text_writer.offset();
1637 assert(cur_off >= label_off);
1638 off = cur_off - label_off;
1639 assert((off & 0b11) == 0);
1640 assert(off < 128 * 1024 * 1024);
1641 }
1642
1643 if (off <= 1024 * 1024) {
1644 auto imm19 = -static_cast<ptrdiff_t>(off) / 4;
1645 if (jmp.kind == Jump::Cbz) {
1646 if (jmp.cmp_is_32) {
1647 ASMNC(CBZw, jmp.cmp_reg, imm19);
1648 } else {
1649 ASMNC(CBZx, jmp.cmp_reg, imm19);
1650 }
1651 } else {
1652 if (jmp.cmp_is_32) {
1653 ASMNC(CBNZw, jmp.cmp_reg, imm19);
1654 } else {
1655 ASMNC(CBNZx, jmp.cmp_reg, imm19);
1656 }
1657 }
1658
1659 if (is_pending) {
1660 this->text_writer.label_ref(target_label,
1661 this->text_writer.offset() - 4,
1662 LabelFixupKind::AARCH64_COND_BR);
1663 }
1664 } else {
1665 assert(!is_pending);
1666 this->text_writer.ensure_space(2 * 4);
1667
1668 if (jmp.kind == Jump::Cbz) {
1669 if (jmp.cmp_is_32) { // need to jump over 2 instructions
1670 ASMNC(CBNZw, jmp.cmp_reg, 2);
1671 } else {
1672 ASMNC(CBNZx, jmp.cmp_reg, 2);
1673 }
1674 } else {
1675 if (jmp.cmp_is_32) {
1676 ASMNC(CBZw, jmp.cmp_reg, 2);
1677 } else {
1678 ASMNC(CBZx, jmp.cmp_reg, 2);
1679 }
1680 }
1681 // + 4 since we already wrote the cb(n)z instruction
1682 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1683 }
1684 return;
1685 }
1686
1687 if (jmp.kind == Jump::Tbz || jmp.kind == Jump::Tbnz) {
1688 u32 off = 0;
1689 if (!is_pending) {
1690 const auto label_off = this->text_writer.label_offset(target_label);
1691 const auto cur_off = this->text_writer.offset();
1692 assert(cur_off >= label_off);
1693 off = cur_off - label_off;
1694 assert((off & 0b11) == 0);
1695 assert(off < 128 * 1024 * 1024);
1696 }
1697
1698 if (off <= 32 * 1024) {
1699 auto imm14 = -static_cast<ptrdiff_t>(off) / 4;
1700 if (jmp.kind == Jump::Tbz) {
1701 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, imm14);
1702 } else {
1703 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, imm14);
1704 }
1705
1706 if (is_pending) {
1707 this->text_writer.label_ref(target_label,
1708 this->text_writer.offset() - 4,
1709 LabelFixupKind::AARCH64_TEST_BR);
1710 }
1711 } else {
1712 assert(!is_pending);
1713 this->text_writer.ensure_space(2 * 4);
1714
1715 if (jmp.kind == Jump::Tbz) {
1716 // need to jump over 2 instructions
1717 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, 2);
1718 } else {
1719 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, 2);
1720 }
1721 // + 4 since we already wrote the tb(n)z instruction
1722 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1723 }
1724 return;
1725 }
1726
1727 Da64Cond cond, cond_compl;
1728 switch (jmp.kind) {
1729 case Jump::Jeq:
1730 cond = DA_EQ;
1731 cond_compl = DA_NE;
1732 break;
1733 case Jump::Jne:
1734 cond = DA_NE;
1735 cond_compl = DA_EQ;
1736 break;
1737 case Jump::Jcs:
1738 cond = DA_CS;
1739 cond_compl = DA_CC;
1740 break;
1741 case Jump::Jcc:
1742 cond = DA_CC;
1743 cond_compl = DA_CS;
1744 break;
1745 case Jump::Jmi:
1746 cond = DA_MI;
1747 cond_compl = DA_PL;
1748 break;
1749 case Jump::Jpl:
1750 cond = DA_PL;
1751 cond_compl = DA_MI;
1752 break;
1753 case Jump::Jvs:
1754 cond = DA_VS;
1755 cond_compl = DA_VC;
1756 break;
1757 case Jump::Jvc:
1758 cond = DA_VC;
1759 cond_compl = DA_VS;
1760 break;
1761 case Jump::Jhi:
1762 cond = DA_HI;
1763 cond_compl = DA_LS;
1764 break;
1765 case Jump::Jls:
1766 cond = DA_LS;
1767 cond_compl = DA_HI;
1768 break;
1769 case Jump::Jge:
1770 cond = DA_GE;
1771 cond_compl = DA_LT;
1772 break;
1773 case Jump::Jlt:
1774 cond = DA_LT;
1775 cond_compl = DA_GE;
1776 break;
1777 case Jump::Jgt:
1778 cond = DA_GT;
1779 cond_compl = DA_LE;
1780 break;
1781 case Jump::Jle:
1782 cond = DA_LE;
1783 cond_compl = DA_GT;
1784 break;
1785 default: TPDE_UNREACHABLE("invalid jump kind");
1786 }
1787
1788
1789 u32 off = 0;
1790 if (!is_pending) {
1791 const auto label_off = this->text_writer.label_offset(target_label);
1792 const auto cur_off = this->text_writer.offset();
1793 assert(cur_off >= label_off);
1794 off = cur_off - label_off;
1795 assert((off & 0b11) == 0);
1796 assert(off < 128 * 1024 * 1024);
1797 }
1798
1799 if (off <= 1024 * 1024) {
1800 ASMNC(BCOND, cond, -static_cast<ptrdiff_t>(off) / 4);
1801
1802 if (is_pending) {
1803 this->text_writer.label_ref(target_label,
1804 this->text_writer.offset() - 4,
1805 LabelFixupKind::AARCH64_COND_BR);
1806 }
1807 } else {
1808 assert(!is_pending);
1809 this->text_writer.ensure_space(2 * 4);
1810
1811    // 2 to skip over the following branch
1812 ASMNC(BCOND, cond_compl, 2);
1813 // + 4 since we already wrote the branch instruction
1814 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1815 }
1816}
1817template <IRAdaptor Adaptor,
1818 typename Derived,
1819 template <typename, typename, typename> class BaseTy,
1820 typename Config>
1821Da64Cond CompilerA64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(Jump jmp) {
1822  switch (jmp.kind) {
1823 case Jump::Jeq: return DA_EQ;
1824 case Jump::Jne: return DA_NE;
1825 case Jump::Jcs: return DA_CS;
1826 case Jump::Jcc: return DA_CC;
1827 case Jump::Jmi: return DA_MI;
1828 case Jump::Jpl: return DA_PL;
1829 case Jump::Jvs: return DA_VS;
1830 case Jump::Jvc: return DA_VC;
1831 case Jump::Jhi: return DA_HI;
1832 case Jump::Jls: return DA_LS;
1833 case Jump::Jge: return DA_GE;
1834 case Jump::Jlt: return DA_LT;
1835 case Jump::Jgt: return DA_GT;
1836 case Jump::Jle: return DA_LE;
1837 case Jump::jmp: return DA_AL;
1838 default: TPDE_UNREACHABLE("invalid jump kind for conversion to Da64Cond");
1839 }
1840}
1841
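// Editorial note (not in the original source): generate_raw_set materializes a
// condition as 0/1 with CSET; the companion generate_raw_mask below produces
// 0/all-ones with CSETM, which is useful as a select mask.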
1842template <IRAdaptor Adaptor,
1843 typename Derived,
1844 template <typename, typename, typename> class BaseTy,
1845 typename Config>
1846void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1847    Jump cc, AsmReg dst) {
1848 ASM(CSETw, dst, jump_to_cond(cc));
1849}
1850
1851template <IRAdaptor Adaptor,
1852 typename Derived,
1853 template <typename, typename, typename> class BaseTy,
1854 typename Config>
1855void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1856    Jump cc, AsmReg dst) {
1857 ASM(CSETMx, dst, jump_to_cond(cc));
1858}
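// Editorial note (not in the original source): conditional select; CSEL picks
// true_select when the condition holds and false_select otherwise, in either
// 32- or 64-bit width.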
1859template <IRAdaptor Adaptor,
1860 typename Derived,
1861 template <typename, typename, typename> class BaseTy,
1862 typename Config>
1863void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_select(
1864    Jump cc, AsmReg dst, AsmReg true_select, AsmReg false_select, bool is_64) {
1865 this->text_writer.ensure_space(4);
1866 Da64Cond cond = jump_to_cond(cc);
1867 if (is_64) {
1868 ASMNC(CSELx, dst, true_select, false_select, cond);
1869 } else {
1870 ASMNC(CSELw, dst, true_select, false_select, cond);
1871 }
1872}
1873
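// Editorial note (not in the original source): integer extension via bitfield
// extract. SBFX/UBFX with lsb 0 and width `from` sign-/zero-extends the low
// `from` bits of src into dst; the W form suffices whenever the result fits
// in 32 bits.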
1874template <IRAdaptor Adaptor,
1875 typename Derived,
1876 template <typename, typename, typename> class BaseTy,
1877 typename Config>
1878void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1879    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) {
1880 assert(from < to && to <= 64);
1881 (void)to;
1882 if (sign) {
1883 if (to <= 32) {
1884 ASM(SBFXw, dst, src, 0, from);
1885 } else {
1886 ASM(SBFXx, dst, src, 0, from);
1887 }
1888 } else {
1889 if (to <= 32) {
1890 ASM(UBFXw, dst, src, 0, from);
1891 } else {
1892 ASM(UBFXx, dst, src, 0, from);
1893 }
1894 }
1895}
1896
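// Editorial note (not in the original source): calls are lowered through the
// generic CallBuilder with the AAPCS assigner; arguments are assigned first,
// then the call is emitted, and finally the result (if any) is bound to the
// returned value registers.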
1897template <IRAdaptor Adaptor,
1898 typename Derived,
1899 template <typename, typename, typename> typename BaseTy,
1900 typename Config>
1901void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_call(
1902    std::variant<SymRef, ValuePart> &&target,
1903 std::span<CallArg> arguments,
1904 typename Base::ValueRef *result,
1905 bool) {
1906 CCAssignerAAPCS assigner;
1907 CallBuilder cb{*derived(), assigner};
1908 for (auto &arg : arguments) {
1909 cb.add_arg(std::move(arg));
1910 }
1911 cb.call(std::move(target));
1912 if (result) {
1913 cb.add_ret(*result);
1914 }
1915}
1916
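// Editorial note (not in the original source): switch-lowering helper. CMP with
// an immediate only accepts 12-bit values (optionally shifted left by 12), so
// case values that do not encode are first materialized into tmp_reg and
// compared register-to-register.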
1917template <IRAdaptor Adaptor,
1918 typename Derived,
1919 template <typename, typename, typename> typename BaseTy,
1920 typename Config>
1921void CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
1922 AsmReg cmp_reg, AsmReg tmp_reg, u64 case_value, bool width_is_32) {
1923 if (width_is_32) {
1924 if (!ASMIF(CMPwi, cmp_reg, case_value)) {
1925 materialize_constant(case_value, Config::GP_BANK, 4, tmp_reg);
1926 ASM(CMPw, cmp_reg, tmp_reg);
1927 }
1928 } else {
1929 if (!ASMIF(CMPxi, cmp_reg, case_value)) {
1930      materialize_constant(case_value, Config::GP_BANK, 8, tmp_reg);
1931 ASM(CMPx, cmp_reg, tmp_reg);
1932 }
1933 }
1934}
1935
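// Editorial note (not in the original source): compare the selector against a
// single case value and branch to case_label on equality.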
1936template <IRAdaptor Adaptor,
1937 typename Derived,
1938 template <typename, typename, typename> typename BaseTy,
1939 typename Config>
1940void CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
1941 Label case_label,
1942 AsmReg cmp_reg,
1943 AsmReg tmp_reg,
1944 u64 case_value,
1945 bool width_is_32) {
1946 switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
1947 generate_raw_jump(Jump::Jeq, case_label);
1948}
1949
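// Editorial note (not in the original source): jump-table dispatch. The
// selector is rebased by low_bound, compared against the table range, and an
// unsigned Jhi branch sends out-of-range values to the default label before
// the jump table itself is created.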
1950template <IRAdaptor Adaptor,
1951 typename Derived,
1952 template <typename, typename, typename> typename BaseTy,
1953 typename Config>
1954FunctionWriterBase::JumpTable *
1955 CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_create_jump_table(
1956 Label default_label,
1957 AsmReg cmp_reg,
1958 AsmReg tmp_reg,
1959 u64 low_bound,
1960 u64 high_bound,
1961 bool width_is_32) {
1962 if (low_bound > 0) {
1963 if (width_is_32) {
1964 if (!ASMIF(SUBwi, cmp_reg, cmp_reg, low_bound)) {
1965 materialize_constant(low_bound, Config::GP_BANK, 4, tmp_reg);
1966 ASM(SUBw, cmp_reg, cmp_reg, tmp_reg);
1967 }
1968 } else {
1969 if (!ASMIF(SUBxi, cmp_reg, cmp_reg, low_bound)) {
1970        materialize_constant(low_bound, Config::GP_BANK, 8, tmp_reg);
1971 ASM(SUBx, cmp_reg, cmp_reg, tmp_reg);
1972 }
1973 }
1974 }
1975 switch_emit_cmp(cmp_reg, tmp_reg, high_bound - low_bound, width_is_32);
1976 generate_raw_jump(Jump::Jhi, default_label);
1977
1978 u64 range = high_bound - low_bound + 1;
1979 return &this->text_writer.create_jump_table(
1980 range, cmp_reg, tmp_reg, width_is_32);
1981}
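// Editorial note (not in the original source): one step of a binary-search
// switch. Branch to case_label on equality, to gt_label if the value is above
// the pivot (unsigned), and fall through to the less-than side otherwise.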
1982
1983template <IRAdaptor Adaptor,
1984 typename Derived,
1985 template <typename, typename, typename> typename BaseTy,
1986 typename Config>
1987void CompilerA64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
1988 Label case_label,
1989 Label gt_label,
1990 AsmReg cmp_reg,
1991 AsmReg tmp_reg,
1992 u64 case_value,
1993 bool width_is_32) {
1994 switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
1995 generate_raw_jump(Jump::Jhi, gt_label);
1996}
1997
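// Editorial note (not in the original source): TLS address computation. Only
// the general-dynamic TLSDESC sequence is emitted (ADRP/LDR/ADD/BLR with the
// matching R_AARCH64_TLSDESC_* relocations); the descriptor call returns the
// offset in x0, which is added to the thread pointer read from TPIDR_EL0.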
1998template <IRAdaptor Adaptor,
1999 typename Derived,
2000 template <typename, typename, typename> typename BaseTy,
2001 typename Config>
2002CompilerA64<Adaptor, Derived, BaseTy, Config>::ScratchReg
2003    CompilerA64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
2004        SymRef sym, TLSModel model) {
2005 switch (model) {
2006 default: // TODO: implement optimized access for non-gd-model
2007 case TLSModel::GlobalDynamic: {
2008 assert(!this->stack.is_leaf_function);
2009 this->stack.generated_call = true;
2010 ScratchReg r0_scratch{this};
2011 AsmReg r0 = r0_scratch.alloc_specific(AsmReg::R0);
2012 ScratchReg r1_scratch{this};
2013 AsmReg r1 = r1_scratch.alloc_specific(AsmReg::R1);
2014 // The call only clobbers flags, x0, x1, and lr. x0 and x1 are already fixed
2015 // in the scratch registers, so only make sure that lr isn't used otherwise.
2016 if (this->register_file.is_used(Reg{AsmReg::LR})) {
2017 this->evict_reg(Reg{AsmReg::LR});
2018 }
2019
2020 this->text_writer.ensure_space(0x18);
2021 this->reloc_text(
2022 sym, elf::R_AARCH64_TLSDESC_ADR_PAGE21, this->text_writer.offset(), 0);
2023 ASMNC(ADRP, r0, 0, 0);
2024 this->reloc_text(
2025 sym, elf::R_AARCH64_TLSDESC_LD64_LO12, this->text_writer.offset(), 0);
2026 ASMNC(LDRxu, r1, r0, 0);
2027 this->reloc_text(
2028 sym, elf::R_AARCH64_TLSDESC_ADD_LO12, this->text_writer.offset(), 0);
2029 ASMNC(ADDxi, r0, r0, 0);
2030 this->reloc_text(
2031 sym, elf::R_AARCH64_TLSDESC_CALL, this->text_writer.offset(), 0);
2032 ASMNC(BLR, r1);
2033 ASMNC(MRS, r1, 0xde82); // TPIDR_EL0
2034 // TODO: maybe return expr x0+x1.
2035 ASMNC(ADDx, r0, r1, r0);
2036 return r0_scratch;
2037 }
2038 }
2039}
2040
2041} // namespace tpde::a64