TPDE
CompilerA64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "tpde/AssemblerElf.hpp"
7#include "tpde/AssignmentPartRef.hpp"
8#include "tpde/CompilerBase.hpp"
9#include "tpde/arm64/FunctionWriterA64.hpp"
10#include "tpde/base.hpp"
11#include "tpde/util/SmallVector.hpp"
12#include "tpde/util/misc.hpp"
13
14#include <bit>
15#include <disarm64.h>
16#include <elf.h>
17
18// Helper macros for assembling in the compiler
19#if defined(ASM) || defined(ASMNC) || defined(ASMC)
20 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
21#endif
22
23/// Encode an instruction with an explicit compiler pointer
24#define ASMC(compiler, op, ...) \
25 ((compiler)->text_writer.write_inst(de64_##op(__VA_ARGS__)))
26/// Encode an instruction using `this` as the compiler
27#define ASM(...) ASMC(this, __VA_ARGS__)
28/// Encode an instruction without checking that enough space is available
29#define ASMNC(op, ...) \
30 (this->text_writer.write_inst_unchecked(de64_##op(__VA_ARGS__)))
31/// Encode an instruction if the encoding is successful (returns true)
32#define ASMIFC(compiler, op, ...) \
33 ((compiler)->text_writer.try_write_inst(de64_##op(__VA_ARGS__)))
34/// Encode an instruction if the encoding is successful (returns true)
35#define ASMIF(...) ASMIFC(this, __VA_ARGS__)
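// Usage sketch (illustrative; `dst` and `src` are placeholder registers):
//
//   ASM(ADDxi, dst, src, 16);
//   // ...expands to:
//   this->text_writer.write_inst(de64_ADDxi(dst, src, 16));
//
// ASMIF/ASMIFC return whether the encoding succeeded, so callers can try an
// immediate form first and fall back to a longer sequence otherwise.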
36
37namespace tpde::a64 {
38
39struct AsmReg : Reg {
40 enum REG : u8 {
41 R0 = 0,
42 R1,
43 R2,
44 R3,
45 R4,
46 R5,
47 R6,
48 R7,
49 R8,
50 R9,
51 R10,
52 R11,
53 R12,
54 R13,
55 R14,
56 R15,
57 R16,
58 R17,
59 R18,
60 R19,
61 R20,
62 R21,
63 R22,
64 R23,
65 R24,
66 R25,
67 R26,
68 R27,
69 R28,
70 R29,
71 FP = 29,
72 R30,
73 LR = 30,
74 SP = 31,
75
76 V0 = 32,
77 V1,
78 V2,
79 V3,
80 V4,
81 V5,
82 V6,
83 V7,
84 V8,
85 V9,
86 V10,
87 V11,
88 V12,
89 V13,
90 V14,
91 V15,
92 V16,
93 V17,
94 V18,
95 V19,
96 V20,
97 V21,
98 V22,
99 V23,
100 V24,
101 V25,
102 V26,
103 V27,
104 V28,
105 V29,
106 V30,
107 V31
108 };
109
110 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
111
112 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
113
114 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
115
116 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
117 assert(id <= SP || (id >= V0 && id <= V31));
118 }
119
120 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
121 assert(id <= SP || (id >= V0 && id <= V31));
122 }
123
124 operator DA_GReg() const noexcept {
125 assert(reg_id < V0);
126 return DA_GReg{reg_id};
127 }
128
129 operator DA_GRegZR() const noexcept {
130 assert(reg_id < V0);
131 assert(reg_id != SP); // 31 means SP in our enums
132 return DA_GRegZR{reg_id};
133 }
134
135 operator DA_GRegSP() const noexcept {
136 assert(reg_id <= SP);
137 return DA_GRegSP{reg_id};
138 }
139
140 operator DA_VReg() const noexcept {
141 assert(reg_id >= V0 && reg_id <= V31);
142 return DA_VReg{static_cast<u8>(reg_id - V0)};
143 }
144};
145
146constexpr static u64
147 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
148 u64 set = 0;
149 for (const auto reg : regs) {
150 set |= 1ull << reg;
151 }
152 return set;
153}
154
155template <size_t N>
156constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
157 u64 set = 0;
158 for (const auto reg : regs) {
159 set |= 1ull << reg.id();
160 }
161 return set;
162}
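// Example (values follow from the register enum above):
//   create_bitmask({AsmReg::R0, AsmReg::R1}) == 0b11
//   create_bitmask({AsmReg::V0})             == (u64{1} << 32)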
163
164class CCAssignerAAPCS : public CCAssigner {
165 static constexpr CCInfo Info{
166      // we reserve SP, FP, R16 and R17 for our special use cases
167 .allocatable_regs =
168 0xFFFF'FFFF'FFFF'FFFF &
169 ~create_bitmask({AsmReg::SP, AsmReg::FP, AsmReg::R16, AsmReg::R17}),
170 // callee-saved registers
171 .callee_saved_regs = create_bitmask({
172 AsmReg::R19,
173 AsmReg::R20,
174 AsmReg::R21,
175 AsmReg::R22,
176 AsmReg::R23,
177 AsmReg::R24,
178 AsmReg::R25,
179 AsmReg::R26,
180 AsmReg::R27,
181 AsmReg::R28,
182 AsmReg::V8,
183 AsmReg::V9,
184 AsmReg::V10,
185 AsmReg::V11,
186 AsmReg::V12,
187 AsmReg::V13,
188 AsmReg::V14,
189 AsmReg::V15,
190 }),
191 .arg_regs = create_bitmask({
192 AsmReg::R0,
193 AsmReg::R1,
194 AsmReg::R2,
195 AsmReg::R3,
196 AsmReg::R4,
197 AsmReg::R5,
198 AsmReg::R6,
199 AsmReg::R7,
200 AsmReg::R8, // sret register
201 AsmReg::V0,
202 AsmReg::V1,
203 AsmReg::V2,
204 AsmReg::V3,
205 AsmReg::V4,
206 AsmReg::V5,
207 AsmReg::V6,
208 AsmReg::V7,
209 }),
210 };
211
212 // NGRN = Next General-purpose Register Number
213 // NSRN = Next SIMD/FP Register Number
214 // NSAA = Next Stack Argument Address
215 u32 ngrn = 0, nsrn = 0, nsaa = 0;
216 u32 ret_ngrn = 0, ret_nsrn = 0;
217
218public:
219 CCAssignerAAPCS() noexcept : CCAssigner(Info) {}
220
221 void reset() noexcept override {
222 ngrn = nsrn = nsaa = ret_ngrn = ret_nsrn = 0;
223 }
224
225 void assign_arg(CCAssignment &arg) noexcept override {
226 if (arg.byval) [[unlikely]] {
227 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
228 arg.stack_off = nsaa;
229 nsaa += arg.size;
230 return;
231 }
232
233 if (arg.sret) [[unlikely]] {
234 arg.reg = AsmReg{AsmReg::R8};
235 return;
236 }
237
238 if (arg.bank == RegBank{0}) {
239 if (arg.align > 8) {
240 ngrn = util::align_up(ngrn, 2);
241 }
242 if (ngrn + arg.consecutive < 8) {
243 arg.reg = Reg{AsmReg::R0 + ngrn};
244 ngrn += 1;
245 } else {
246 ngrn = 8;
247 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
248 arg.stack_off = nsaa;
249 nsaa += 8;
250 }
251 } else {
252 if (nsrn + arg.consecutive < 8) {
253 arg.reg = Reg{AsmReg::V0 + nsrn};
254 nsrn += 1;
255 } else {
256 nsrn = 8;
257 u32 size = util::align_up(arg.size, 8);
258 nsaa = util::align_up(nsaa, size);
259 arg.stack_off = nsaa;
260 nsaa += size;
261 }
262 }
263 }
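  // Worked example (a sketch of the rules above, not the full AAPCS text):
  // for the argument list (i64, double, i64) this assigns
  //   arg0 -> R0 (ngrn 0 -> 1), arg1 -> V0 (nsrn 0 -> 1), arg2 -> R1 (ngrn 1 -> 2).
  // Once a bank's register count reaches 8, later arguments of that bank are
  // placed on the stack at nsaa, aligned to at least 8 bytes.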
264
265 u32 get_stack_size() noexcept override { return nsaa; }
266
267 void assign_ret(CCAssignment &arg) noexcept override {
268 assert(!arg.byval && !arg.sret);
269 if (arg.bank == RegBank{0}) {
270 if (arg.align > 8) {
271 ret_ngrn = util::align_up(ret_ngrn, 2);
272 }
273 if (ret_ngrn + arg.consecutive < 8) {
274 arg.reg = Reg{AsmReg::R0 + ret_ngrn};
275 ret_ngrn += 1;
276 } else {
277 assert(false);
278 }
279 } else {
280 if (ret_nsrn + arg.consecutive < 8) {
281 arg.reg = Reg{AsmReg::V0 + ret_nsrn};
282 ret_nsrn += 1;
283 } else {
284 assert(false);
285 }
286 }
287 }
288};
289
290struct PlatformConfig : CompilerConfigDefault {
291 using Assembler = AssemblerElfA64;
292 using AsmReg = tpde::a64::AsmReg;
293 using DefaultCCAssigner = CCAssignerAAPCS;
294 using FunctionWriter = FunctionWriterA64;
295
296 static constexpr RegBank GP_BANK{0};
297 static constexpr RegBank FP_BANK{1};
298 static constexpr bool FRAME_INDEXING_NEGATIVE = false;
299 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
300 static constexpr u32 NUM_BANKS = 2;
301};
302
303namespace concepts {
304template <typename T, typename Config>
305concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
306 {
307 a.arg_is_int128(std::declval<typename T::IRValueRef>())
308 } -> std::convertible_to<bool>;
309
310 {
311 a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
312 } -> std::convertible_to<bool>;
313};
314} // namespace concepts
315
316template <IRAdaptor Adaptor,
317 typename Derived,
318 template <typename, typename, typename> typename BaseTy =
319 CompilerBase,
320 typename Config = PlatformConfig>
321struct CompilerA64 : BaseTy<Adaptor, Derived, Config> {
322 using Base = BaseTy<Adaptor, Derived, Config>;
323
324 using IRValueRef = typename Base::IRValueRef;
325 using IRBlockRef = typename Base::IRBlockRef;
326 using IRFuncRef = typename Base::IRFuncRef;
327
328 using ScratchReg = typename Base::ScratchReg;
329 using ValuePartRef = typename Base::ValuePartRef;
330 using ValuePart = typename Base::ValuePart;
331 using GenericValuePart = typename Base::GenericValuePart;
332
333 using Assembler = typename PlatformConfig::Assembler;
334 using RegisterFile = typename Base::RegisterFile;
335
336 using CallArg = typename Base::CallArg;
337
338 using Base::derived;
339
340
341   // TODO(ts): make this dependent on the number of callee-saved regs of the
342   // current function, or on whether the function contains a call?
343 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
344 6};
345
346 enum CPU_FEATURES : u32 {
347 CPU_BASELINE = 0, // ARMV8.0
348 };
349
350 CPU_FEATURES cpu_feats = CPU_BASELINE;
351
352 // When handling function arguments, we need to prevent argument registers
353 // from being handed out as fixed registers
354 //
355 // Additionally, we prevent R0 and R1 from being fixed assignments to
356 // prevent issues with exception handling
357 u64 fixed_assignment_nonallocatable_mask =
358 create_bitmask({AsmReg::R0, AsmReg::R1});
359 u32 func_start_off = 0u, func_prologue_alloc = 0u, func_epilogue_alloc = 0u;
360 /// Offset to the `add sp, sp, XXX` instruction that the argument handling
361 /// uses to access stack arguments if needed
362 u32 func_arg_stack_add_off = ~0u;
363 AsmReg func_arg_stack_add_reg = AsmReg::make_invalid();
364
365 /// Permanent scratch register, e.g. to materialize constants/offsets. This is
366 /// used by materialize_constant, load_from_stack, spill_reg.
367 AsmReg permanent_scratch_reg = AsmReg::R16;
368
369 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
370 u32 reg_save_frame_off = 0;
371 util::SmallVector<u32, 8> func_ret_offs = {};
372
373 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
374 u32 stack_adjust_off = 0;
375 u32 stack_size = 0;
376 u32 stack_sub = 0;
377
378 void set_stack_used() noexcept;
379
380 public:
381 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
382 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
383
384 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
385 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
386 void call_impl(std::variant<SymRef, ValuePart> &&) noexcept;
387 void reset_stack() noexcept;
388 };
389
390 // for now, always generate an object
391 explicit CompilerA64(Adaptor *adaptor,
392 const CPU_FEATURES cpu_features = CPU_BASELINE)
393 : Base{adaptor}, cpu_feats(cpu_features) {
394 static_assert(std::is_base_of_v<CompilerA64, Derived>);
395 static_assert(concepts::Compiler<Derived, PlatformConfig>);
396 }
397
398 void start_func(u32 func_idx) noexcept;
399
400 void gen_func_prolog_and_args(CCAssigner *cc_assigner) noexcept;
401
402 // note: this has to call assembler->end_func
403 void finish_func(u32 func_idx) noexcept;
404
405 void reset() noexcept;
406
407 // helpers
408
409 void gen_func_epilog() noexcept;
410
411 void
412 spill_reg(const AsmReg reg, const u32 frame_off, const u32 size) noexcept;
413
414 void load_from_stack(AsmReg dst,
415 i32 frame_off,
416 u32 size,
417 bool sign_extend = false) noexcept;
418
419 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
420
421 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
422
423 GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
424 assert(ap.stack_valid() && !ap.variable_ref());
425 return typename GenericValuePart::Expr(AsmReg::R29, ap.frame_off());
426 }
427
428 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
429
430 /// Dynamic alloca of a fixed-size region.
431 void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;
432
433 /// Dynamic alloca of a dynamically-sized region (elem_size * count bytes).
434   /// count must be 64 bits wide.
435 void alloca_dynamic(u64 elem_size,
436 ValuePart &&count,
437 u32 align,
438 ValuePart &res) noexcept;
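  // Sketch of what alloca_fixed emits for a small request with alignment <= 16
  // (size is first rounded up to 16; see the implementation below):
  //   sub x<res>, sp, #<rounded size>
  //   mov sp, x<res>
  // Sizes >= 0x1000 or alignments > 16 additionally need a second SUB, a
  // materialized size constant, and/or an AND to realign the result.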
439
440 void materialize_constant(const u64 *data,
441 RegBank bank,
442 u32 size,
443 AsmReg dst) noexcept;
444 void materialize_constant(u64 const_u64,
445 RegBank bank,
446 u32 size,
447 AsmReg dst) noexcept {
448 assert(size <= sizeof(const_u64));
449 materialize_constant(&const_u64, bank, size, dst);
450 }
451
452 AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;
453
454 struct Jump {
455 enum Kind : uint8_t {
456 Jeq,
457 Jne,
458 Jcs,
459 Jhs = Jcs,
460 Jcc,
461 Jlo = Jcc,
462 Jmi,
463 Jpl,
464 Jvs,
465 Jvc,
466 Jhi,
467 Jls,
468 Jge,
469 Jlt,
470 Jgt,
471 Jle,
472       // TODO: consistency
473 jmp,
474 Cbz,
475 Cbnz,
476 Tbz,
477 Tbnz
478 };
479
480 Kind kind;
481 AsmReg cmp_reg;
482 bool cmp_is_32;
483 u8 test_bit;
484
485 constexpr Jump() : kind(Kind::jmp) {}
486
487 constexpr Jump(Kind kind) : kind(kind), cmp_is_32(false), test_bit(0) {
488 assert(kind != Cbz && kind != Cbnz && kind != Tbz && kind != Tbnz);
489 }
490
491 constexpr Jump(Kind kind, AsmReg cmp_reg, bool cmp_is_32)
492 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(cmp_is_32), test_bit(0) {
493 assert(kind == Cbz || kind == Cbnz);
494 }
495
496 constexpr Jump(Kind kind, AsmReg cmp_reg, u8 test_bit)
497 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(false), test_bit(test_bit) {
498 assert(kind == Tbz || kind == Tbnz);
499 }
500
501 constexpr Jump change_kind(Kind new_kind) const {
502 auto cpy = *this;
503 cpy.kind = new_kind;
504 return cpy;
505 }
506 };
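  // Construction examples for the three constructor forms above (illustrative;
  // `reg` is a placeholder AsmReg):
  //   Jump{Jump::Jeq}                            // condition-flag branch (b.eq)
  //   Jump{Jump::Cbz, reg, /*cmp_is_32=*/true}   // cbz w<reg>, <target>
  //   Jump{Jump::Tbnz, reg, /*test_bit=*/u8{3}}  // tbnz <reg>, #3, <target>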
507
508 Jump invert_jump(Jump jmp) noexcept;
509 Jump swap_jump(Jump jmp) noexcept;
510
511 void generate_branch_to_block(Jump jmp,
512 IRBlockRef target,
513 bool needs_split,
514 bool last_inst) noexcept;
515
516 void generate_raw_jump(Jump jmp, Label target) noexcept;
517
518   /// Convert a jump condition to disarm64's Da64Cond.
519   /// \warning Cbz, Cbnz, Tbz and Tbnz are not supported
520 Da64Cond jump_to_cond(Jump jmp) noexcept;
521 /// Set dst to 1 if cc is true, otherwise set it to zero
522 void generate_raw_set(Jump cc, AsmReg dst) noexcept;
523 /// Set all bits of dst to 1 if cc is true, otherwise set dst to zero
524 void generate_raw_mask(Jump cc, AsmReg dst) noexcept;
525
526 /// Moves true_select into dst if cc is true,
527 /// otherwise move false_select into dst
528 void generate_raw_select(Jump cc,
529 AsmReg dst,
530 AsmReg true_select,
531 AsmReg false_select,
532 bool is_64) noexcept;
533
534 void generate_raw_intext(
535 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
536
537 /// Generate a function call
538 ///
539 /// This will get the arguments into the correct registers according to the
540 /// calling convention, clear non-callee-saved registers from the register
541 /// file (make sure you do not have any fixed assignments left over) and
542 /// fill the result registers (the u8 in the ScratchReg pair indicates the
543 /// register bank)
544 ///
545   /// The target can be a symbol (a call via the PLT with a relocation) or a
546   /// ValuePart for an indirect call. Result is an optional reference.
547 void generate_call(std::variant<SymRef, ValuePart> &&target,
548 std::span<CallArg> arguments,
549 typename Base::ValueRef *result,
550 bool variable_args = false);
551
552 /// Generate code sequence to load address of sym into a register. This will
553 /// generate a function call for dynamic TLS access models.
554 ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;
555
556 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
557 return ((cpu_feats & feats) == feats);
558 }
559};
560
561template <IRAdaptor Adaptor,
562 typename Derived,
563 template <typename, typename, typename> class BaseTy,
564 typename Config>
565void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
566 set_stack_used() noexcept {
567 if (stack_adjust_off == 0) {
568 this->compiler.text_writer.ensure_space(16);
569 stack_adjust_off = this->compiler.text_writer.offset();
570 this->compiler.text_writer.cur_ptr() += 4;
571 }
572}
573
574template <IRAdaptor Adaptor,
575 typename Derived,
576 template <typename, typename, typename> class BaseTy,
577 typename Config>
578void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
579 ValuePart &vp, CCAssignment &cca) noexcept {
580 AsmReg ptr_reg = vp.load_to_reg(&this->compiler);
581 AsmReg tmp_reg = AsmReg::R16;
582
583 auto size = cca.size;
584 set_stack_used();
585 for (u32 off = 0; off < size;) {
586 if (size - off >= 8) {
587 ASMC(&this->compiler, LDRxu, tmp_reg, ptr_reg, off);
588 ASMC(&this->compiler, STRxu, tmp_reg, DA_SP, cca.stack_off + off);
589 off += 8;
590 } else if (size - off >= 4) {
591 ASMC(&this->compiler, LDRwu, tmp_reg, ptr_reg, off);
592 ASMC(&this->compiler, STRwu, tmp_reg, DA_SP, cca.stack_off + off);
593 off += 4;
594 } else if (size - off >= 2) {
595 ASMC(&this->compiler, LDRHu, tmp_reg, ptr_reg, off);
596 ASMC(&this->compiler, STRHu, tmp_reg, DA_SP, cca.stack_off + off);
597 off += 2;
598 } else {
599 ASMC(&this->compiler, LDRBu, tmp_reg, ptr_reg, off);
600 ASMC(&this->compiler, STRBu, tmp_reg, DA_SP, cca.stack_off + off);
601 off += 1;
602 }
603 }
604}
605
606template <IRAdaptor Adaptor,
607 typename Derived,
608 template <typename, typename, typename> class BaseTy,
609 typename Config>
610void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
611 ValuePart &vp, CCAssignment &cca) noexcept {
612 set_stack_used();
613
614 auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
615 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
616 switch (cca.size) {
617 case 1: ASMC(&this->compiler, STRBu, reg, DA_SP, cca.stack_off); break;
618 case 2: ASMC(&this->compiler, STRHu, reg, DA_SP, cca.stack_off); break;
619 case 4: ASMC(&this->compiler, STRwu, reg, DA_SP, cca.stack_off); break;
620 case 8: ASMC(&this->compiler, STRxu, reg, DA_SP, cca.stack_off); break;
621 default: TPDE_UNREACHABLE("invalid GP reg size");
622 }
623 } else {
624 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
625 switch (cca.size) {
626 case 1: ASMC(&this->compiler, STRbu, reg, DA_SP, cca.stack_off); break;
627 case 2: ASMC(&this->compiler, STRhu, reg, DA_SP, cca.stack_off); break;
628 case 4: ASMC(&this->compiler, STRsu, reg, DA_SP, cca.stack_off); break;
629 case 8: ASMC(&this->compiler, STRdu, reg, DA_SP, cca.stack_off); break;
630 case 16: ASMC(&this->compiler, STRqu, reg, DA_SP, cca.stack_off); break;
631 default: TPDE_UNREACHABLE("invalid FP reg size");
632 }
633 }
634}
635
636template <IRAdaptor Adaptor,
637 typename Derived,
638 template <typename, typename, typename> class BaseTy,
639 typename Config>
640void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
641 std::variant<SymRef, ValuePart> &&target) noexcept {
642 u32 sub = 0;
643 if (stack_adjust_off != 0) {
644 auto *text_data = this->compiler.text_writer.begin_ptr();
645 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + stack_adjust_off);
646 u32 stack_size = this->assigner.get_stack_size();
647 sub = util::align_up(stack_size, stack_size < 0x1000 ? 0x10 : 0x1000);
648 *write_ptr = de64_SUBxi(DA_SP, DA_SP, sub);
649 } else {
650 assert(this->assigner.get_stack_size() == 0);
651 }
652
653 // For vector registers, only the lowest half is callee-saved. Evict all
654 // value parts larger than 8 bytes.
655 auto fp_regs = RegisterFile::bank_regs(Config::FP_BANK);
656 auto fp_csrs = fp_regs & this->assigner.get_ccinfo().callee_saved_regs;
657 auto used_fp_csrs = fp_csrs & this->compiler.register_file.used;
658 for (auto reg_id : util::BitSetIterator<>{used_fp_csrs}) {
659 Reg reg{reg_id};
660 ValLocalIdx local_idx = this->compiler.register_file.reg_local_idx(reg);
661 auto part = this->compiler.register_file.reg_part(reg);
662 AssignmentPartRef ap{this->compiler.val_assignment(local_idx), part};
663 if (ap.part_size() > 8) {
664 this->compiler.evict(ap);
665 }
666 }
667
668 if (auto *sym = std::get_if<SymRef>(&target)) {
669 ASMC(&this->compiler, BL, 0);
670 this->compiler.reloc_text(
671 *sym, R_AARCH64_CALL26, this->compiler.text_writer.offset() - 4);
672 } else {
673 ValuePart &tvp = std::get<ValuePart>(target);
674 if (tvp.can_salvage()) {
675 ASMC(&this->compiler, BLR, tvp.salvage(&this->compiler));
676 } else {
677 AsmReg reg = this->compiler.permanent_scratch_reg;
678 tvp.reload_into_specific_fixed(&this->compiler, reg);
679 ASMC(&this->compiler, BLR, reg);
680 }
681 tvp.reset(&this->compiler);
682 }
683
684 if (stack_adjust_off != 0) {
685 ASMC(&this->compiler, ADDxi, DA_SP, DA_SP, sub);
686 }
687}
688
689template <IRAdaptor Adaptor,
690 typename Derived,
691 template <typename, typename, typename> class BaseTy,
692 typename Config>
693void CompilerA64<Adaptor, Derived, BaseTy, Config>::start_func(
694 const u32 /*func_idx*/) noexcept {
695 this->assembler.except_begin_func();
696 this->text_writer.align(16);
697}
698
699template <IRAdaptor Adaptor,
700 typename Derived,
701 template <typename, typename, typename> typename BaseTy,
702 typename Config>
703void CompilerA64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
704 CCAssigner *cc_assigner) noexcept {
705 // prologue:
706 // sub sp, sp, #<frame_size>
707 // stp x29, x30, [sp]
708 // mov x29, sp
709 // optionally create vararg save-area
710 // reserve space for callee-saved regs
711   //   4 bytes per callee-saved reg pair since for each pair we do
712 // stp r1, r2, [sp + XX]
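  // Illustrative result: a function with a 48-byte frame that only saves
  // x19/x20 ends up with a prologue like
  //   sub sp, sp, #48
  //   stp x29, x30, [sp]
  //   mov x29, sp
  //   stp x19, x20, [sp, #16]
  // (the instructions themselves are written later, in finish_func).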
713
714 // TODO(ts): for smaller functions we could enable an optimization
715 // to store the saved regs after the local variables
716 // which we could then use to not allocate space for unsaved regs
717 // which could help in the common case.
718 // However, we need to commit to this at the beginning of the function
719 // as otherwise stack accesses need to skip the reg-save area
720
721 func_ret_offs.clear();
722 func_start_off = this->text_writer.offset();
723
724 const CCInfo &cc_info = cc_assigner->get_ccinfo();
725
726 // We don't actually generate the prologue here and merely allocate space
727 // for it. Right now, we don't know which callee-saved registers will be
728 // used. While we could pad with nops, we later move the beginning of the
729 // function so that small functions don't have to execute 9 nops.
730 // See finish_func.
731 this->stack.frame_size = 16; // FP, LR
732 {
733 auto csr = cc_info.callee_saved_regs;
734 auto csr_gp = csr & this->register_file.bank_regs(Config::GP_BANK);
735 auto csr_fp = csr & this->register_file.bank_regs(Config::FP_BANK);
736 u32 gp_saves = std::popcount(csr_gp);
737 u32 fp_saves = std::popcount(csr_fp);
738 // LDP/STP can handle two registers of the same bank.
739 u32 reg_save_size = 4 * ((gp_saves + 1) / 2 + (fp_saves + 1) / 2);
740 // TODO: support CSR of Qx/Vx registers, not just Dx
741 this->stack.frame_size += util::align_up(gp_saves * 8 + fp_saves * 8, 16);
742
743 // Reserve space for sub sp, stp x29/x30, and mov x29, sp.
744 func_prologue_alloc = reg_save_size + 12;
745 this->text_writer.ensure_space(func_prologue_alloc);
746 this->text_writer.cur_ptr() += func_prologue_alloc;
747 // ldp needs the same number of instructions as stp
748 // additionally, there's an add sp, ldp x29/x30, ret (+12)
749 func_epilogue_alloc = reg_save_size + 12;
750 // extra mov sp, fp
751 func_epilogue_alloc += this->adaptor->cur_has_dynamic_alloca() ? 4 : 0;
752 }
753
754 // TODO(ts): support larger stack alignments?
755
756 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
757 reg_save_frame_off = this->stack.frame_size;
758 // We additionally store a pointer to the stack area, which we can't compute
759 // with a constant offset from the frame pointer. Add 16 bytes to maintain
760 // alignment.
761 this->stack.frame_size += 8 * 8 + 8 * 16 + 16;
762 this->text_writer.ensure_space(4 * 8);
763 ASMNC(STPx, DA_GP(0), DA_GP(1), DA_SP, reg_save_frame_off);
764 ASMNC(STPx, DA_GP(2), DA_GP(3), DA_SP, reg_save_frame_off + 16);
765 ASMNC(STPx, DA_GP(4), DA_GP(5), DA_SP, reg_save_frame_off + 32);
766 ASMNC(STPx, DA_GP(6), DA_GP(7), DA_SP, reg_save_frame_off + 48);
767 ASMNC(STPq, DA_V(0), DA_V(1), DA_SP, reg_save_frame_off + 64);
768 ASMNC(STPq, DA_V(2), DA_V(3), DA_SP, reg_save_frame_off + 96);
769 ASMNC(STPq, DA_V(4), DA_V(5), DA_SP, reg_save_frame_off + 128);
770 ASMNC(STPq, DA_V(6), DA_V(7), DA_SP, reg_save_frame_off + 160);
771 }
772
773 // Temporarily prevent argument registers from being assigned.
774 assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
775 "argument registers must also be allocatable");
776 this->register_file.allocatable &= ~cc_info.arg_regs;
777
778 this->func_arg_stack_add_off = ~0u;
779
780 u32 arg_idx = 0;
781 for (const IRValueRef arg : this->adaptor->cur_args()) {
782 derived()->handle_func_arg(
783 arg_idx,
784 arg,
785 [&](ValuePart &&vp, CCAssignment cca) -> std::optional<i32> {
786 if (!cca.byval) {
787 cca.bank = vp.bank();
788 cca.size = vp.part_size();
789 }
790
791 cc_assigner->assign_arg(cca);
792
793 if (cca.reg.valid()) [[likely]] {
794 vp.set_value_reg(this, cca.reg);
795 // Mark register as allocatable as soon as it is assigned. If the
796 // argument is unused, the register will be freed immediately and
797 // can be used for later stack arguments.
798 this->register_file.allocatable |= u64{1} << cca.reg.id();
799 return {};
800 }
801
802 AsmReg dst = vp.alloc_reg(this);
803
804 this->text_writer.ensure_space(8);
805 AsmReg stack_reg = AsmReg::R17;
806 // TODO: allocate an actual scratch register for this.
807 assert(
808 !(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
809 "x17 must not be allocatable");
810 if (this->func_arg_stack_add_off == ~0u) {
811 this->func_arg_stack_add_off = this->text_writer.offset();
812 this->func_arg_stack_add_reg = stack_reg;
813 // Fixed in finish_func when frame size is known
814 ASMNC(ADDxi, stack_reg, DA_SP, 0);
815 }
816
817 if (cca.byval) {
818 ASMNC(ADDxi, dst, stack_reg, cca.stack_off);
819 } else if (cca.bank == Config::GP_BANK) {
820 switch (cca.size) {
821 case 1: ASMNC(LDRBu, dst, stack_reg, cca.stack_off); break;
822 case 2: ASMNC(LDRHu, dst, stack_reg, cca.stack_off); break;
823 case 4: ASMNC(LDRwu, dst, stack_reg, cca.stack_off); break;
824 case 8: ASMNC(LDRxu, dst, stack_reg, cca.stack_off); break;
825 default: TPDE_UNREACHABLE("invalid GP reg size");
826 }
827 } else {
828 assert(cca.bank == Config::FP_BANK);
829 switch (cca.size) {
830 case 1: ASMNC(LDRbu, dst, stack_reg, cca.stack_off); break;
831 case 2: ASMNC(LDRhu, dst, stack_reg, cca.stack_off); break;
832 case 4: ASMNC(LDRsu, dst, stack_reg, cca.stack_off); break;
833 case 8: ASMNC(LDRdu, dst, stack_reg, cca.stack_off); break;
834 case 16: ASMNC(LDRqu, dst, stack_reg, cca.stack_off); break;
835 default: TPDE_UNREACHABLE("invalid FP reg size");
836 }
837 }
838 return {};
839 });
840
841 arg_idx += 1;
842 }
843
844 // Hack: we don't know the frame size, so for a va_start(), we cannot easily
845 // compute the offset from the frame pointer. But we have a stack_reg here,
846 // so use it for var args.
847 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
848 AsmReg stack_reg = AsmReg::R17;
849 // TODO: allocate an actual scratch register for this.
850 assert(!(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
851 "x17 must not be allocatable");
852 if (this->func_arg_stack_add_off == ~0u) {
853 this->func_arg_stack_add_off = this->text_writer.offset();
854 this->func_arg_stack_add_reg = stack_reg;
855 // Fixed in finish_func when frame size is known
856 ASMC(this, ADDxi, stack_reg, DA_SP, 0);
857 }
858 ASM(ADDxi, stack_reg, stack_reg, cc_assigner->get_stack_size());
859 ASM(STRxu, stack_reg, DA_GP(29), this->reg_save_frame_off + 192);
860
861 // TODO: extract ngrn/nsrn from CCAssigner
862 // TODO: this isn't quite accurate, e.g. for (i128, i128, i128, i64, i128),
863 // this should be 8 but will end up with 7.
864 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
865 u32 ngrn = 8 - util::cnt_lz<u16>((arg_regs & 0xff) << 8 | 0x80);
866 u32 nsrn = 8 - util::cnt_lz<u16>(((arg_regs >> 32) & 0xff) << 8 | 0x80);
867 this->scalar_arg_count = ngrn;
868 this->vec_arg_count = nsrn;
869 }
870
871 this->register_file.allocatable |= cc_info.arg_regs;
872}
873
874template <IRAdaptor Adaptor,
875 typename Derived,
876 template <typename, typename, typename> typename BaseTy,
877 typename Config>
878void CompilerA64<Adaptor, Derived, BaseTy, Config>::finish_func(
879 u32 func_idx) noexcept {
880 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
881 u64 saved_regs = this->register_file.clobbered & csr;
882
883 const auto dyn_alloca = this->adaptor->cur_has_dynamic_alloca();
884 auto stack_reg = DA_SP;
885 if (dyn_alloca) {
886 stack_reg = DA_GP(29);
887 }
888
889 auto final_frame_size = util::align_up(this->stack.frame_size, 16);
890 if (final_frame_size > 4095) {
891 // round up to 4k since SUB cannot encode immediates greater than 4095
892 final_frame_size = util::align_up(final_frame_size, 4096);
893 assert(final_frame_size < 16 * 1024 * 1024);
894 }
895
896 auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
897
898 {
899 // NB: code alignment factor 4, data alignment factor -8.
900 util::SmallVector<u32, 16> prologue;
901 prologue.push_back(de64_SUBxi(DA_SP, DA_SP, final_frame_size));
902 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
903 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset,
904 final_frame_size);
905 prologue.push_back(de64_STPx(DA_GP(29), DA_GP(30), DA_SP, 0));
906 prologue.push_back(de64_MOV_SPx(DA_GP(29), DA_SP));
907 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 2);
908 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
909 dwarf::a64::DW_reg_fp);
910 this->assembler.eh_write_inst(
911 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_fp, final_frame_size / 8);
912 this->assembler.eh_write_inst(
913 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_lr, final_frame_size / 8 - 1);
914
915 // Patched below
916 auto fde_prologue_adv_off = this->assembler.eh_writer.size();
917 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
918
919 AsmReg last_reg = AsmReg::make_invalid();
920 u32 frame_off = 16;
921 for (auto reg : util::BitSetIterator{saved_regs}) {
922 if (last_reg.valid()) {
923 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
924 const auto last_bank = this->register_file.reg_bank(last_reg);
925 if (reg_bank == last_bank) {
926 if (reg_bank == Config::GP_BANK) {
927 prologue.push_back(
928 de64_STPx(last_reg, AsmReg{reg}, stack_reg, frame_off));
929 } else {
930 prologue.push_back(
931 de64_STPd(last_reg, AsmReg{reg}, stack_reg, frame_off));
932 }
933 frame_off += 16;
934 last_reg = AsmReg::make_invalid();
935 } else {
936 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
937 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
938 frame_off += 8;
939 last_reg = AsmReg{reg};
940 }
941 continue;
942 }
943
944 u8 dwarf_base = reg < 32 ? dwarf::a64::DW_reg_v0 : dwarf::a64::DW_reg_x0;
945 u8 dwarf_reg = dwarf_base + reg % 32;
946 u32 cfa_off = (final_frame_size - frame_off) / 8;
947 if ((dwarf_reg & dwarf::DWARF_CFI_PRIMARY_OPCODE_MASK) == 0) {
948 this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
949 } else {
950 this->assembler.eh_write_inst(
951 dwarf::DW_CFA_offset_extended, dwarf_reg, cfa_off);
952 }
953
954 last_reg = AsmReg{reg};
955 }
956
957 if (last_reg.valid()) {
958 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
959 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
960 } else {
961 assert(this->register_file.reg_bank(last_reg) == Config::FP_BANK);
962 prologue.push_back(de64_STRdu(last_reg, stack_reg, frame_off));
963 }
964 }
965
966 assert(prologue.size() * sizeof(u32) <= func_prologue_alloc);
967
968 assert(prologue.size() < 0x4c);
969 this->assembler.eh_writer.data()[fde_prologue_adv_off] =
970 dwarf::DW_CFA_advance_loc | (prologue.size() - 3);
971
972     // Pad with NOPs so that func_prologue_alloc - prologue.size() * 4 is a
973     // multiple of 16 (the function alignment).
974 const auto nop_count = (func_prologue_alloc / 4 - prologue.size()) % 4;
975 const auto nop = de64_NOP();
976 for (auto i = 0u; i < nop_count; ++i) {
977 prologue.push_back(nop);
978 }
979
980 // Shrink function at the beginning
981 u32 skip = util::align_down(func_prologue_alloc - prologue.size() * 4, 16);
982 std::memset(this->text_writer.begin_ptr() + func_start_off, 0, skip);
983 func_start_off += skip;
984 std::memcpy(this->text_writer.begin_ptr() + func_start_off,
985 prologue.data(),
986 prologue.size() * sizeof(u32));
987 }
988
989 if (func_arg_stack_add_off != ~0u) {
990 auto *inst_ptr = this->text_writer.begin_ptr() + func_arg_stack_add_off;
991 *reinterpret_cast<u32 *>(inst_ptr) =
992 de64_ADDxi(func_arg_stack_add_reg, DA_SP, final_frame_size);
993 }
994
995 // TODO(ts): honor cur_needs_unwind_info
996 auto func_sym = this->func_syms[func_idx];
997 auto func_sec = this->text_writer.get_sec_ref();
998
999 if (func_ret_offs.empty()) {
1000 auto func_size = this->text_writer.offset() - func_start_off;
1001 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
1002 this->assembler.eh_end_fde(fde_off, func_sym);
1003 this->assembler.except_encode_func(func_sym,
1004 this->text_writer.label_offsets.data());
1005 return;
1006 }
1007
1008 auto *text_data = this->text_writer.begin_ptr();
1009 u32 first_ret_off = func_ret_offs[0];
1010 u32 ret_size = 0;
1011 {
1012 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + first_ret_off);
1013 const auto ret_start = write_ptr;
1014 if (dyn_alloca) {
1015 *write_ptr++ = de64_MOV_SPx(DA_SP, DA_GP(29));
1016 } else {
1017 *write_ptr++ = de64_LDPx(DA_GP(29), DA_GP(30), DA_SP, 0);
1018 }
1019
1020 AsmReg last_reg = AsmReg::make_invalid();
1021 u32 frame_off = 16;
1022 for (auto reg : util::BitSetIterator{saved_regs}) {
1023 if (last_reg.valid()) {
1024 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
1025 const auto last_bank = this->register_file.reg_bank(last_reg);
1026 if (reg_bank == last_bank) {
1027 if (reg_bank == Config::GP_BANK) {
1028 *write_ptr++ =
1029 de64_LDPx(last_reg, AsmReg{reg}, stack_reg, frame_off);
1030 } else {
1031 *write_ptr++ =
1032 de64_LDPd(last_reg, AsmReg{reg}, stack_reg, frame_off);
1033 }
1034 frame_off += 16;
1035 last_reg = AsmReg::make_invalid();
1036 } else {
1037 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
1038 *write_ptr++ = de64_LDRxu(last_reg, stack_reg, frame_off);
1039 frame_off += 8;
1040 last_reg = AsmReg{reg};
1041 }
1042 continue;
1043 }
1044
1045 last_reg = AsmReg{reg};
1046 }
1047
1048 if (last_reg.valid()) {
1049 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
1050 *write_ptr++ = de64_LDRxu(last_reg, stack_reg, frame_off);
1051 } else {
1052 *write_ptr++ = de64_LDRdu(last_reg, stack_reg, frame_off);
1053 }
1054 }
1055
1056 if (dyn_alloca) {
1057 *write_ptr++ = de64_LDPx(DA_GP(29), DA_GP(30), DA_SP, 0);
1058 }
1059
1060 *write_ptr++ = de64_ADDxi(DA_SP, DA_SP, final_frame_size);
1061 *write_ptr++ = de64_RET(DA_GP(30));
1062
1063 ret_size = (write_ptr - ret_start) * 4;
1064 assert(ret_size <= func_epilogue_alloc);
1065 std::memset(write_ptr, 0, func_epilogue_alloc - ret_size);
1066 }
1067
1068 for (u32 i = 1; i < func_ret_offs.size(); ++i) {
1069 std::memcpy(text_data + func_ret_offs[i],
1070 text_data + first_ret_off,
1071 func_epilogue_alloc);
1072 }
1073
1074 u32 func_end_ret_off = this->text_writer.offset() - func_epilogue_alloc;
1075 if (func_ret_offs.back() == func_end_ret_off) {
1076 this->text_writer.cur_ptr() -= func_epilogue_alloc - ret_size;
1077 }
1078
1079 auto func_size = this->text_writer.offset() - func_start_off;
1080 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
1081 this->assembler.eh_end_fde(fde_off, func_sym);
1082 this->assembler.except_encode_func(func_sym,
1083 this->text_writer.label_offsets.data());
1084}
1085
1086template <IRAdaptor Adaptor,
1087 typename Derived,
1088 template <typename, typename, typename> typename BaseTy,
1089 typename Config>
1090void CompilerA64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
1091 func_ret_offs.clear();
1092 Base::reset();
1093}
1094
1095template <IRAdaptor Adaptor,
1096 typename Derived,
1097 template <typename, typename, typename> typename BaseTy,
1098 typename Config>
1099void CompilerA64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
1100 // epilogue:
1101 // if !func_has_dynamic_alloca:
1102 // ldp x29, x30, [sp]
1103 // else:
1104 // mov sp, fp
1105 // for each saved reg pair:
1106 // if func_has_dynamic_alloca:
1107 // ldp r1, r2, [fp, #<off>]
1108 // else:
1109 // ldp r1, r2, [sp, #<off>]
1110 // if func_has_dynamic_alloca:
1111 // ldp x29, x30, [sp]
1112 // add sp, sp, #<frame_size>
1113 // ret
1114 //
1115 // however, since we will later patch this, we only
1116 // reserve the space for now
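  // Illustrative result for a frame without dynamic allocas that saved
  // x19/x20 (mirroring the prologue example in gen_func_prolog_and_args):
  //   ldp x29, x30, [sp]
  //   ldp x19, x20, [sp, #16]
  //   add sp, sp, #<frame_size>
  //   ret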
1117
1118 func_ret_offs.push_back(this->text_writer.offset());
1119 this->text_writer.ensure_space(func_epilogue_alloc);
1120 this->text_writer.cur_ptr() += func_epilogue_alloc;
1121}
1122
1123template <IRAdaptor Adaptor,
1124 typename Derived,
1125 template <typename, typename, typename> typename BaseTy,
1126 typename Config>
1127void CompilerA64<Adaptor, Derived, BaseTy, Config>::spill_reg(
1128 const AsmReg reg, const u32 frame_off, const u32 size) noexcept {
1129 assert((size & (size - 1)) == 0);
1130 assert(util::align_up(frame_off, size) == frame_off);
1131 // We don't support stack frames that aren't encodeable with add/sub.
1132 assert(frame_off < 0x1'000'000);
1133 this->text_writer.ensure_space(8);
1134
1135 u32 off = frame_off;
1136 auto addr_base = AsmReg{AsmReg::FP};
1137 if (off >= 0x1000 * size) [[unlikely]] {
1138 // We cannot encode the offset in the store instruction.
1139 ASMNC(ADDxi, permanent_scratch_reg, DA_GP(29), off & ~0xfff);
1140 off &= 0xfff;
1141 addr_base = permanent_scratch_reg;
1142 }
1143
1144 assert(-static_cast<i32>(frame_off) < 0);
1145 if (reg.id() <= AsmReg::R30) {
1146 switch (size) {
1147 case 1: ASMNC(STRBu, reg, addr_base, off); break;
1148 case 2: ASMNC(STRHu, reg, addr_base, off); break;
1149 case 4: ASMNC(STRwu, reg, addr_base, off); break;
1150 case 8: ASMNC(STRxu, reg, addr_base, off); break;
1151 default: TPDE_UNREACHABLE("invalid register spill size");
1152 }
1153 } else {
1154 switch (size) {
1155 case 1: ASMNC(STRbu, reg, addr_base, off); break;
1156 case 2: ASMNC(STRhu, reg, addr_base, off); break;
1157 case 4: ASMNC(STRsu, reg, addr_base, off); break;
1158 case 8: ASMNC(STRdu, reg, addr_base, off); break;
1159 case 16: ASMNC(STRqu, reg, addr_base, off); break;
1160 default: TPDE_UNREACHABLE("invalid register spill size");
1161 }
1162 }
1163}
1164
1165template <IRAdaptor Adaptor,
1166 typename Derived,
1167 template <typename, typename, typename> typename BaseTy,
1168 typename Config>
1169void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
1170 const AsmReg dst,
1171 const i32 frame_off,
1172 const u32 size,
1173 const bool sign_extend) noexcept {
1174 assert((size & (size - 1)) == 0);
1175 assert(util::align_up(frame_off, size) == frame_off);
1176 // We don't support stack frames that aren't encodeable with add/sub.
1177 assert(frame_off >= 0 && frame_off < 0x1'000'000);
1178 this->text_writer.ensure_space(8);
1179
1180 u32 off = frame_off;
1181 auto addr_base = AsmReg{AsmReg::FP};
1182 if (off >= 0x1000 * size) [[unlikely]] {
1183 // need to calculate this explicitly
1184 addr_base = dst.id() <= AsmReg::R30 ? dst : permanent_scratch_reg;
1185 ASMNC(ADDxi, addr_base, DA_GP(29), off & ~0xfff);
1186 off &= 0xfff;
1187 }
1188
1189 if (dst.id() <= AsmReg::R30) {
1190 if (!sign_extend) {
1191 switch (size) {
1192 case 1: ASMNC(LDRBu, dst, addr_base, off); break;
1193 case 2: ASMNC(LDRHu, dst, addr_base, off); break;
1194 case 4: ASMNC(LDRwu, dst, addr_base, off); break;
1195 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1196 default: TPDE_UNREACHABLE("invalid register spill size");
1197 }
1198 } else {
1199 switch (size) {
1200 case 1: ASMNC(LDRSBwu, dst, addr_base, off); break;
1201 case 2: ASMNC(LDRSHwu, dst, addr_base, off); break;
1202 case 4: ASMNC(LDRSWxu, dst, addr_base, off); break;
1203 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1204 default: TPDE_UNREACHABLE("invalid register spill size");
1205 }
1206 }
1207 return;
1208 }
1209
1210 assert(!sign_extend);
1211
1212 switch (size) {
1213 case 1: ASMNC(LDRbu, dst, addr_base, off); break;
1214 case 2: ASMNC(LDRhu, dst, addr_base, off); break;
1215 case 4: ASMNC(LDRsu, dst, addr_base, off); break;
1216 case 8: ASMNC(LDRdu, dst, addr_base, off); break;
1217 case 16: ASMNC(LDRqu, dst, addr_base, off); break;
1218 default: TPDE_UNREACHABLE("invalid register spill size");
1219 }
1220}
1221
1222template <IRAdaptor Adaptor,
1223 typename Derived,
1224 template <typename, typename, typename> typename BaseTy,
1225 typename Config>
1226void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
1227 const AsmReg dst, const AssignmentPartRef ap) noexcept {
1228 auto frame_off = ap.variable_stack_off();
1229 assert(frame_off >= 0);
1230 if (!ASMIF(ADDxi, dst, DA_GP(29), frame_off)) {
1231 materialize_constant(frame_off, Config::GP_BANK, 4, dst);
1232 ASM(ADDx_uxtw, dst, DA_GP(29), dst, 0);
1233 }
1234}
1235
1236template <IRAdaptor Adaptor,
1237 typename Derived,
1238 template <typename, typename, typename> typename BaseTy,
1239 typename Config>
1240void CompilerA64<Adaptor, Derived, BaseTy, Config>::mov(
1241 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
1242 this->text_writer.ensure_space(4);
1243 assert(dst.valid());
1244 assert(src.valid());
1245 if (dst.id() <= AsmReg::SP && src.id() <= AsmReg::SP) {
1246 assert(dst.id() != AsmReg::SP && src.id() != AsmReg::SP);
1247 if (size > 4) {
1248 ASMNC(MOVx, dst, src);
1249 } else {
1250 ASMNC(MOVw, dst, src);
1251 }
1252 } else if (dst.id() >= AsmReg::V0 && src.id() >= AsmReg::V0) {
1253 ASMNC(ORR16b, dst, src, src);
1254 } else if (dst.id() <= AsmReg::SP) {
1255 assert(dst.id() != AsmReg::SP);
1256 // gp<-vector
1257 assert(src.id() >= AsmReg::V0);
1258 assert(size <= 8);
1259 if (size <= 4) {
1260 ASMNC(FMOVws, dst, src);
1261 } else {
1262 ASMNC(FMOVxd, dst, src);
1263 }
1264 } else {
1265 // vector<-gp
1266 assert(src.id() <= AsmReg::R30);
1267 assert(dst.id() >= AsmReg::V0);
1268 assert(size <= 8);
1269 if (size <= 4) {
1270 ASMNC(FMOVsw, dst, src);
1271 } else {
1272 ASMNC(FMOVdx, dst, src);
1273 }
1274 }
1275}
1276
1277template <IRAdaptor Adaptor,
1278 typename Derived,
1279 template <typename, typename, typename> typename BaseTy,
1280 typename Config>
1281AsmReg CompilerA64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1282 GenericValuePart &gv) noexcept {
1283 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1284
1285 ScratchReg scratch{derived()};
1286 if (!expr.has_base() && !expr.has_index()) {
1287 AsmReg dst = scratch.alloc_gp();
1288 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, dst);
1289 expr.disp = 0;
1290 } else if (!expr.has_base() && expr.has_index()) {
1291 AsmReg index_reg = expr.index_reg();
1292 if (std::holds_alternative<ScratchReg>(expr.index)) {
1293 scratch = std::move(std::get<ScratchReg>(expr.index));
1294 } else {
1295 (void)scratch.alloc_gp();
1296 }
1297 AsmReg dst = scratch.cur_reg();
1298 if ((expr.scale & (expr.scale - 1)) == 0) {
1299 const auto shift = util::cnt_tz<u64>(expr.scale);
1300 ASM(LSLxi, dst, index_reg, shift);
1301 } else {
1302 AsmReg tmp2 = permanent_scratch_reg;
1303 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1304 ASM(MULx, dst, index_reg, tmp2);
1305 }
1306 } else if (expr.has_base() && expr.has_index()) {
1307 AsmReg base_reg = expr.base_reg();
1308 AsmReg index_reg = expr.index_reg();
1309 if (std::holds_alternative<ScratchReg>(expr.base)) {
1310 scratch = std::move(std::get<ScratchReg>(expr.base));
1311 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1312 scratch = std::move(std::get<ScratchReg>(expr.index));
1313 } else {
1314 (void)scratch.alloc_gp();
1315 }
1316 AsmReg dst = scratch.cur_reg();
1317 if ((expr.scale & (expr.scale - 1)) == 0) {
1318 const auto shift = util::cnt_tz<u64>(expr.scale);
1319 ASM(ADDx_lsl, dst, base_reg, index_reg, shift);
1320 } else {
1321 AsmReg tmp2 = permanent_scratch_reg;
1322 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1323 ASM(MADDx, dst, index_reg, tmp2, base_reg);
1324 }
1325 } else if (expr.has_base() && !expr.has_index()) {
1326 AsmReg base_reg = expr.base_reg();
1327 if (std::holds_alternative<ScratchReg>(expr.base)) {
1328 scratch = std::move(std::get<ScratchReg>(expr.base));
1329 } else {
1330 (void)scratch.alloc_gp();
1331 }
1332 AsmReg dst = scratch.cur_reg();
1333 if (expr.disp != 0 && ASMIF(ADDxi, dst, base_reg, expr.disp)) {
1334 expr.disp = 0;
1335 } else if (dst != base_reg) {
1336 ASM(MOVx, dst, base_reg);
1337 }
1338 } else {
1339 TPDE_UNREACHABLE("inconsistent GenericValuePart::Expr");
1340 }
1341
1342 AsmReg dst = scratch.cur_reg();
1343 if (expr.disp != 0) {
1344 if (!ASMIF(ADDxi, dst, dst, expr.disp)) {
1345 AsmReg tmp2 = permanent_scratch_reg;
1346 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, tmp2);
1347 ASM(ADDx, dst, dst, tmp2);
1348 }
1349 }
1350
1351 gv.state = std::move(scratch);
1352 return dst;
1353}
1354
1355template <IRAdaptor Adaptor,
1356 typename Derived,
1357 template <typename, typename, typename> typename BaseTy,
1358 typename Config>
1359void CompilerA64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
1360 u64 size, u32 align, ValuePart &res) noexcept {
1361 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1362 size = tpde::util::align_up(size, 16);
1363 AsmReg res_reg = res.alloc_reg(this);
1364 if (size >= 0x10'0000) {
1365 auto tmp = permanent_scratch_reg;
1366 materialize_constant(size, Config::GP_BANK, 8, tmp);
1367 ASM(SUBx_uxtx, res_reg, DA_SP, tmp, 0);
1368 } else if (size >= 0x1000) {
1369 ASM(SUBxi, res_reg, DA_SP, size & 0xff'f000);
1370 ASM(SUBxi, res_reg, res_reg, size & 0xfff);
1371 } else {
1372 ASM(SUBxi, res_reg, DA_SP, size & 0xfff);
1373 }
1374
1375 if (align > 16) {
1376 // The stack pointer is always at least 16-byte aligned.
1377 ASM(ANDxi, res_reg, res_reg, ~(u64{align} - 1));
1378 }
1379
1380 if (size > 0) {
1381 ASM(MOV_SPx, DA_SP, res_reg);
1382 }
1383}
1384
1385template <IRAdaptor Adaptor,
1386 typename Derived,
1387 template <typename, typename, typename> typename BaseTy,
1388 typename Config>
1389void CompilerA64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
1390 u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
1391 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1392 AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
1393 AsmReg res_reg = res.alloc_try_reuse(this, count);
1394
1395 if (elem_size == 0) {
1396 ASM(MOVZw, res_reg, 0);
1397 } else if ((elem_size & (elem_size - 1)) == 0) {
1398 const auto shift = util::cnt_tz(elem_size);
1399 if (shift <= 4) {
1400 ASM(SUBx_uxtx, res_reg, DA_SP, size_reg, shift);
1401 } else {
1402 ASM(LSLxi, res_reg, size_reg, shift);
1403 ASM(SUBx_uxtx, res_reg, DA_SP, res_reg, 0);
1404 }
1405 } else {
1406 auto tmp = permanent_scratch_reg;
1407 materialize_constant(elem_size, Config::GP_BANK, 8, tmp);
1408 ASM(MULx, res_reg, size_reg, tmp);
1409 ASM(SUBx_uxtx, res_reg, DA_SP, res_reg, 0);
1410 }
1411
1412 align = align > 16 ? align : 16;
1413 if (elem_size & (align - 1)) {
1414 ASM(ANDxi, res_reg, res_reg, ~(u64{align} - 1));
1415 }
1416
1417 ASM(MOV_SPx, DA_SP, res_reg);
1418}
1419
1420template <IRAdaptor Adaptor,
1421 typename Derived,
1422 template <typename, typename, typename> typename BaseTy,
1423 typename Config>
1424void CompilerA64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1425 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1426 this->text_writer.ensure_space(5 * 4);
1427
1428 const auto const_u64 = data[0];
1429 if (bank == Config::GP_BANK) {
1430 assert(size <= 8);
1431 if (const_u64 == 0) {
1432 ASMNC(MOVZw, dst, 0);
1433 return;
1434 }
1435
1436 this->text_writer.cur_ptr() +=
1437 sizeof(u32) *
1438 de64_MOVconst(reinterpret_cast<u32 *>(this->text_writer.cur_ptr()),
1439 dst,
1440 const_u64);
1441 return;
1442 }
1443
1444 assert(bank == Config::FP_BANK);
1445 // Try instructions that take an immediate
1446 if (size == 4) {
1447 if (ASMIF(FMOVsi, dst, std::bit_cast<float>((u32)const_u64))) {
1448 return;
1449 } else if (ASMIF(MOVId, dst, static_cast<u32>(const_u64))) {
1450 return;
1451 }
1452 } else if (size == 8) {
1453 if (ASMIF(FMOVdi, dst, std::bit_cast<double>(const_u64))) {
1454 return;
1455 } else if (ASMIF(MOVId, dst, const_u64)) {
1456 return;
1457 }
1458 } else if (size == 16) {
1459 const auto high_u64 = data[1];
1460 if (const_u64 == high_u64 && ASMIF(MOVI2d, dst, const_u64)) {
1461 return;
1462 } else if (high_u64 == 0 && ASMIF(MOVId, dst, const_u64)) {
1463 return;
1464 }
1465 }
1466
1467   // We must either load through a GP register or from memory. Both cases need a
1468 // GP register in the common case. We reserve x16/x17 for cases like this.
1469 if (size <= 16) {
1470 this->register_file.mark_clobbered(permanent_scratch_reg);
1471 // Copy from a GP register
1472 // TODO: always load from memory?
1473 if (size <= 8) {
1474 materialize_constant(data, Config::GP_BANK, size, permanent_scratch_reg);
1475 if (size <= 4) {
1476 ASMNC(FMOVsw, dst, permanent_scratch_reg);
1477 } else {
1478 ASMNC(FMOVdx, dst, permanent_scratch_reg);
1479 }
1480 return;
1481 }
1482
1483 auto rodata = this->assembler.get_data_section(true, false);
1484 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), size};
1485 auto sym = this->assembler.sym_def_data(
1486 rodata, "", raw_data, 16, Assembler::SymBinding::LOCAL);
1487 this->text_writer.ensure_space(8); // ensure contiguous instructions
1488 this->reloc_text(
1489 sym, R_AARCH64_ADR_PREL_PG_HI21, this->text_writer.offset(), 0);
1490 ASMNC(ADRP, permanent_scratch_reg, 0, 0);
1491 this->reloc_text(
1492 sym, R_AARCH64_LDST128_ABS_LO12_NC, this->text_writer.offset(), 0);
1493 ASMNC(LDRqu, dst, permanent_scratch_reg, 0);
1494 return;
1495 }
1496
1497 TPDE_FATAL("unable to materialize constant");
1498}
1499
1500template <IRAdaptor Adaptor,
1501 typename Derived,
1502 template <typename, typename, typename> typename BaseTy,
1503 typename Config>
1504AsmReg
1505 CompilerA64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1506 AssignmentPartRef ap, IRValueRef) noexcept {
1507 RegBank bank = ap.bank();
1508 if (bank == Config::FP_BANK && ap.part_size() > 8) {
1509     // FP registers cannot in general be fixed registers, as only the lowest 8
1510 // bytes are callee-saved.
1511 return AsmReg::make_invalid();
1512 }
1513
1514 // TODO(ts): why is this in here?
1515 assert(bank.id() <= Config::NUM_BANKS);
1516 auto reg_mask = this->register_file.bank_regs(bank);
1517 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1518
1519 const auto find_possible_regs = [this,
1520 reg_mask](const u64 preferred_regs) -> u64 {
1521 // try to first get an unused reg, otherwise an unfixed reg
1522 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1523 return free_regs & preferred_regs & reg_mask;
1524 };
1525
1526 u64 possible_regs;
1527 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1528 if (derived()->cur_func_may_emit_calls()) {
1529     // we can only allocate fixed assignments from the callee-saved regs
1530 possible_regs = find_possible_regs(csr);
1531 } else {
1532 // try allocating any non-callee saved register first, except the result
1533 // registers
1534 possible_regs = find_possible_regs(~csr);
1535 if (possible_regs == 0) {
1536 // otherwise fallback to callee-saved regs
1537 possible_regs = find_possible_regs(csr);
1538 }
1539 }
1540
1541 if (possible_regs == 0) {
1542 return AsmReg::make_invalid();
1543 }
1544
1545 // try to first get an unused reg, otherwise an unfixed reg
1546 if ((possible_regs & ~this->register_file.used) != 0) {
1547 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1548 }
1549
1550 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1551 const auto reg = AsmReg{reg_id};
1552
1553 assert(!this->register_file.is_fixed(reg));
1554
1555 const auto local_idx = this->register_file.reg_local_idx(reg);
1556 const auto part = this->register_file.reg_part(reg);
1557 assert(local_idx != Base::INVALID_VAL_LOCAL_IDX);
1558
1559 auto *assignment = this->val_assignment(local_idx);
1560 auto ap = AssignmentPartRef{assignment, part};
1561 if (ap.modified()) {
1562 continue;
1563 }
1564
1565 return reg;
1566 }
1567
1568 return AsmReg::make_invalid();
1569}
1570
1571template <IRAdaptor Adaptor,
1572 typename Derived,
1573 template <typename, typename, typename> class BaseTy,
1574 typename Config>
1575typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1576 CompilerA64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1577 Jump jmp) noexcept {
1578 switch (jmp.kind) {
1579 case Jump::Jeq: return jmp.change_kind(Jump::Jne);
1580 case Jump::Jne: return jmp.change_kind(Jump::Jeq);
1581 case Jump::Jcs: return jmp.change_kind(Jump::Jcc);
1582 case Jump::Jcc: return jmp.change_kind(Jump::Jcs);
1583 case Jump::Jmi: return jmp.change_kind(Jump::Jpl);
1584 case Jump::Jpl: return jmp.change_kind(Jump::Jmi);
1585 case Jump::Jvs: return jmp.change_kind(Jump::Jvc);
1586 case Jump::Jvc: return jmp.change_kind(Jump::Jvs);
1587 case Jump::Jhi: return jmp.change_kind(Jump::Jls);
1588 case Jump::Jls: return jmp.change_kind(Jump::Jhi);
1589 case Jump::Jge: return jmp.change_kind(Jump::Jlt);
1590 case Jump::Jlt: return jmp.change_kind(Jump::Jge);
1591 case Jump::Jgt: return jmp.change_kind(Jump::Jle);
1592 case Jump::Jle: return jmp.change_kind(Jump::Jgt);
1593 case Jump::jmp: return jmp;
1594 case Jump::Cbz: return jmp.change_kind(Jump::Cbnz);
1595 case Jump::Cbnz: return jmp.change_kind(Jump::Cbz);
1596 case Jump::Tbz: return jmp.change_kind(Jump::Tbnz);
1597 case Jump::Tbnz: return jmp.change_kind(Jump::Tbz);
1598 default: TPDE_UNREACHABLE("invalid jump kind");
1599 }
1600}
1601
1602template <IRAdaptor Adaptor,
1603 typename Derived,
1604 template <typename, typename, typename> typename BaseTy,
1605 typename Config>
1606typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1607 CompilerA64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1608 Jump jmp) noexcept {
1609 switch (jmp.kind) {
1610 case Jump::Jeq: return jmp.change_kind(Jump::Jeq);
1611 case Jump::Jne: return jmp.change_kind(Jump::Jne);
1612 case Jump::Jcc: return jmp.change_kind(Jump::Jhi);
1613 case Jump::Jcs: return jmp.change_kind(Jump::Jls);
1614 case Jump::Jhi: return jmp.change_kind(Jump::Jcc);
1615 case Jump::Jls: return jmp.change_kind(Jump::Jcs);
1616 case Jump::Jge: return jmp.change_kind(Jump::Jle);
1617 case Jump::Jlt: return jmp.change_kind(Jump::Jgt);
1618 case Jump::Jgt: return jmp.change_kind(Jump::Jlt);
1619 case Jump::Jle: return jmp.change_kind(Jump::Jge);
1620 case Jump::jmp: return jmp;
1621 case Jump::Jmi:
1622 case Jump::Jpl:
1623 case Jump::Jvs:
1624 case Jump::Jvc:
1625 case Jump::Cbz:
1626 case Jump::Cbnz:
1627 case Jump::Tbz:
1628 case Jump::Tbnz:
1629 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1630 }
1631}
1632
1633template <IRAdaptor Adaptor,
1634 typename Derived,
1635 template <typename, typename, typename> typename BaseTy,
1636 typename Config>
1637void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
1638 const Jump jmp,
1639 IRBlockRef target,
1640 const bool needs_split,
1641 const bool last_inst) noexcept {
1642 const auto target_idx = this->analyzer.block_idx(target);
1643 if (!needs_split || jmp.kind == Jump::jmp) {
1644 this->derived()->move_to_phi_nodes(target_idx);
1645
1646 if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
1647 generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
1648 }
1649 } else {
1650 auto tmp_label = this->text_writer.label_create();
1651 generate_raw_jump(invert_jump(jmp), tmp_label);
1652
1653 this->derived()->move_to_phi_nodes(target_idx);
1654
1655 generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);
1656
1657 this->label_place(tmp_label);
1658 }
1659}
1660
1661template <IRAdaptor Adaptor,
1662 typename Derived,
1663 template <typename, typename, typename> typename BaseTy,
1664 typename Config>
1665void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1666 Jump jmp, Label target_label) noexcept {
1667 const auto is_pending = this->text_writer.label_is_pending(target_label);
1668 this->text_writer.ensure_space(4);
1669 if (jmp.kind == Jump::jmp) {
1670 if (is_pending) {
1671 ASMNC(B, 0);
1672 this->text_writer.label_ref(target_label,
1673 this->text_writer.offset() - 4,
1674 LabelFixupKind::AARCH64_BR);
1675 } else {
1676 const auto label_off = this->text_writer.label_offset(target_label);
1677 const auto cur_off = this->text_writer.offset();
1678 assert(cur_off >= label_off);
1679 const auto diff = cur_off - label_off;
1680 assert((diff & 0b11) == 0);
1681 assert(diff < 128 * 1024 * 1024);
1682
1683 ASMNC(B, -static_cast<ptrdiff_t>(diff) / 4);
1684 }
1685 return;
1686 }
1687
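  // Compare-and-branch: CBZ/CBNZ have a +/-1 MiB range (imm19); farther
  // backward targets use the inverted compare-and-branch over a plain B.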
1688 if (jmp.kind == Jump::Cbz || jmp.kind == Jump::Cbnz) {
1689 u32 off = 0;
1690 if (!is_pending) {
1691 const auto label_off = this->text_writer.label_offset(target_label);
1692 const auto cur_off = this->text_writer.offset();
1693 assert(cur_off >= label_off);
1694 off = cur_off - label_off;
1695 assert((off & 0b11) == 0);
1696 assert(off < 128 * 1024 * 1024);
1697 }
1698
1699 if (off <= 1024 * 1024) {
1700 auto imm19 = -static_cast<ptrdiff_t>(off) / 4;
1701 if (jmp.kind == Jump::Cbz) {
1702 if (jmp.cmp_is_32) {
1703 ASMNC(CBZw, jmp.cmp_reg, imm19);
1704 } else {
1705 ASMNC(CBZx, jmp.cmp_reg, imm19);
1706 }
1707 } else {
1708 if (jmp.cmp_is_32) {
1709 ASMNC(CBNZw, jmp.cmp_reg, imm19);
1710 } else {
1711 ASMNC(CBNZx, jmp.cmp_reg, imm19);
1712 }
1713 }
1714
1715 if (is_pending) {
1716 this->text_writer.label_ref(target_label,
1717 this->text_writer.offset() - 4,
1718 LabelFixupKind::AARCH64_COND_BR);
1719 }
1720 } else {
1721 assert(!is_pending);
1722 this->text_writer.ensure_space(2 * 4);
1723
1724 if (jmp.kind == Jump::Cbz) {
1725 if (jmp.cmp_is_32) { // need to jump over 2 instructions
1726 ASMNC(CBNZw, jmp.cmp_reg, 2);
1727 } else {
1728 ASMNC(CBNZx, jmp.cmp_reg, 2);
1729 }
1730 } else {
1731 if (jmp.cmp_is_32) {
1732 ASMNC(CBZw, jmp.cmp_reg, 2);
1733 } else {
1734 ASMNC(CBZx, jmp.cmp_reg, 2);
1735 }
1736 }
1737 // + 4 since we already wrote the cb(n)z instruction
1738 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1739 }
1740 return;
1741 }
1742
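  // Test-bit-and-branch: TBZ/TBNZ have a +/-32 KiB range (imm14); farther
  // backward targets use the inverted test-and-branch over a plain B.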
1743 if (jmp.kind == Jump::Tbz || jmp.kind == Jump::Tbnz) {
1744 u32 off = 0;
1745 if (!is_pending) {
1746 const auto label_off = this->text_writer.label_offset(target_label);
1747 const auto cur_off = this->text_writer.offset();
1748 assert(cur_off >= label_off);
1749 off = cur_off - label_off;
1750 assert((off & 0b11) == 0);
1751 assert(off < 128 * 1024 * 1024);
1752 }
1753
1754 if (off <= 32 * 1024) {
1755 auto imm14 = -static_cast<ptrdiff_t>(off) / 4;
1756 if (jmp.kind == Jump::Tbz) {
1757 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, imm14);
1758 } else {
1759 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, imm14);
1760 }
1761
1762 if (is_pending) {
1763 this->text_writer.label_ref(target_label,
1764 this->text_writer.offset() - 4,
1765 LabelFixupKind::AARCH64_TEST_BR);
1766 }
1767 } else {
1768 assert(!is_pending);
1769 this->text_writer.ensure_space(2 * 4);
1770
1771 if (jmp.kind == Jump::Tbz) {
1772 // need to jump over 2 instructions
1773 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, 2);
1774 } else {
1775 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, 2);
1776 }
1777 // + 4 since we already wrote the tb(n)z instruction
1778 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1779 }
1780 return;
1781 }
1782
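  // Remaining kinds are flag-based conditional branches; also derive the
  // complementary condition for the out-of-range fallback below.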
1783 Da64Cond cond, cond_compl;
1784 switch (jmp.kind) {
1785 case Jump::Jeq:
1786 cond = DA_EQ;
1787 cond_compl = DA_NE;
1788 break;
1789 case Jump::Jne:
1790 cond = DA_NE;
1791 cond_compl = DA_EQ;
1792 break;
1793 case Jump::Jcs:
1794 cond = DA_CS;
1795 cond_compl = DA_CC;
1796 break;
1797 case Jump::Jcc:
1798 cond = DA_CC;
1799 cond_compl = DA_CS;
1800 break;
1801 case Jump::Jmi:
1802 cond = DA_MI;
1803 cond_compl = DA_PL;
1804 break;
1805 case Jump::Jpl:
1806 cond = DA_PL;
1807 cond_compl = DA_MI;
1808 break;
1809 case Jump::Jvs:
1810 cond = DA_VS;
1811 cond_compl = DA_VC;
1812 break;
1813 case Jump::Jvc:
1814 cond = DA_VC;
1815 cond_compl = DA_VS;
1816 break;
1817 case Jump::Jhi:
1818 cond = DA_HI;
1819 cond_compl = DA_LS;
1820 break;
1821 case Jump::Jls:
1822 cond = DA_LS;
1823 cond_compl = DA_HI;
1824 break;
1825 case Jump::Jge:
1826 cond = DA_GE;
1827 cond_compl = DA_LT;
1828 break;
1829 case Jump::Jlt:
1830 cond = DA_LT;
1831 cond_compl = DA_GE;
1832 break;
1833 case Jump::Jgt:
1834 cond = DA_GT;
1835 cond_compl = DA_LE;
1836 break;
1837 case Jump::Jle:
1838 cond = DA_LE;
1839 cond_compl = DA_GT;
1840 break;
1841 default: TPDE_UNREACHABLE("invalid jump kind");
1842 }
1843
1845 u32 off = 0;
1846 if (!is_pending) {
1847 const auto label_off = this->text_writer.label_offset(target_label);
1848 const auto cur_off = this->text_writer.offset();
1849 assert(cur_off >= label_off);
1850 off = cur_off - label_off;
1851 assert((off & 0b11) == 0);
1852 assert(off < 128 * 1024 * 1024);
1853 }
1854
1855 if (off <= 1024 * 1024) {
1856 ASMNC(BCOND, cond, -static_cast<ptrdiff_t>(off) / 4);
1857
1858 if (is_pending) {
1859 this->text_writer.label_ref(target_label,
1860 this->text_writer.offset() - 4,
1861 LabelFixupKind::AARCH64_COND_BR);
1862 }
1863 } else {
1864 assert(!is_pending);
1865 this->text_writer.ensure_space(2 * 4);
1866
1867    // branch 2 instructions ahead, over the following unconditional B
1868 ASMNC(BCOND, cond_compl, 2);
1869 // + 4 since we already wrote the branch instruction
1870 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1871 }
1872}
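
// Map a Jump kind to the corresponding Da64Cond condition code.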
1873template <IRAdaptor Adaptor,
1874 typename Derived,
1875 template <typename, typename, typename> class BaseTy,
1876 typename Config>
1877Da64Cond CompilerA64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
1878 Jump jmp) noexcept {
1879 switch (jmp.kind) {
1880 case Jump::Jeq: return DA_EQ;
1881 case Jump::Jne: return DA_NE;
1882 case Jump::Jcs: return DA_CS;
1883 case Jump::Jcc: return DA_CC;
1884 case Jump::Jmi: return DA_MI;
1885 case Jump::Jpl: return DA_PL;
1886 case Jump::Jvs: return DA_VS;
1887 case Jump::Jvc: return DA_VC;
1888 case Jump::Jhi: return DA_HI;
1889 case Jump::Jls: return DA_LS;
1890 case Jump::Jge: return DA_GE;
1891 case Jump::Jlt: return DA_LT;
1892 case Jump::Jgt: return DA_GT;
1893 case Jump::Jle: return DA_LE;
1894 case Jump::jmp: return DA_AL;
1895 default: TPDE_UNREACHABLE("invalid jump kind for conversion to Da64Cond");
1896 }
1897}
1898
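// Set dst to 1 if the condition holds, else 0 (32-bit CSET).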
1899template <IRAdaptor Adaptor,
1900 typename Derived,
1901 template <typename, typename, typename> class BaseTy,
1902 typename Config>
1903void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1904 Jump cc, AsmReg dst) noexcept {
1905 ASM(CSETw, dst, jump_to_cond(cc));
1906}
1907
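// Materialize the condition as a mask in dst: all ones if it holds, else zero
// (64-bit CSETM).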
1908template <IRAdaptor Adaptor,
1909 typename Derived,
1910 template <typename, typename, typename> class BaseTy,
1911 typename Config>
1912void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1913 Jump cc, AsmReg dst) noexcept {
1914 ASM(CSETMx, dst, jump_to_cond(cc));
1915}
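
// Conditionally select true_select or false_select into dst (CSEL).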
1916template <IRAdaptor Adaptor,
1917 typename Derived,
1918 template <typename, typename, typename> class BaseTy,
1919 typename Config>
1920void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_select(
1921 Jump cc,
1922 AsmReg dst,
1923 AsmReg true_select,
1924 AsmReg false_select,
1925 bool is_64) noexcept {
1926 this->text_writer.ensure_space(4);
1927 Da64Cond cond = jump_to_cond(cc);
1928 if (is_64) {
1929 ASMNC(CSELx, dst, true_select, false_select, cond);
1930 } else {
1931 ASMNC(CSELw, dst, true_select, false_select, cond);
1932 }
1933}
1934
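// Sign-/zero-extend the low `from` bits of src into dst using SBFX/UBFX; `to`
// only selects between the 32-bit and 64-bit instruction forms.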
1935template <IRAdaptor Adaptor,
1936 typename Derived,
1937 template <typename, typename, typename> class BaseTy,
1938 typename Config>
1939void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1940 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1941 assert(from < to && to <= 64);
1943 if (sign) {
1944 if (to <= 32) {
1945 ASM(SBFXw, dst, src, 0, from);
1946 } else {
1947 ASM(SBFXx, dst, src, 0, from);
1948 }
1949 } else {
1950 if (to <= 32) {
1951 ASM(UBFXw, dst, src, 0, from);
1952 } else {
1953 ASM(UBFXx, dst, src, 0, from);
1954 }
1955 }
1956}
1957
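// Lower a call through the generic CallBuilder using the AAPCS argument
// assigner: add the arguments, emit the call, then bind the result.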
1958template <IRAdaptor Adaptor,
1959 typename Derived,
1960 template <typename, typename, typename> typename BaseTy,
1961 typename Config>
1962void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_call(
1963 std::variant<SymRef, ValuePart> &&target,
1964 std::span<CallArg> arguments,
1965 typename Base::ValueRef *result,
1966 bool) {
1967 CCAssignerAAPCS assigner;
1968 CallBuilder cb{*derived(), assigner};
1969 for (auto &arg : arguments) {
1970 cb.add_arg(std::move(arg));
1971 }
1972 cb.call(std::move(target));
1973 if (result) {
1974 cb.add_ret(*result);
1975 }
1976}
1977
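// Compute the address of a thread-local variable. All models currently use the
// general-dynamic TLSDESC sequence (ADRP/LDR/ADD/BLR with the corresponding
// TLSDESC relocations) followed by adding the thread pointer (TPIDR_EL0).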
1978template <IRAdaptor Adaptor,
1979 typename Derived,
1980 template <typename, typename, typename> typename BaseTy,
1981 typename Config>
1982CompilerA64<Adaptor, Derived, BaseTy, Config>::ScratchReg
1983 CompilerA64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
1984 SymRef sym, TLSModel model) noexcept {
1985 switch (model) {
1986 default: // TODO: implement optimized access for non-gd-model
1987 case TLSModel::GlobalDynamic: {
1988 ScratchReg r0_scratch{this};
1989 AsmReg r0 = r0_scratch.alloc_specific(AsmReg::R0);
1990 ScratchReg r1_scratch{this};
1991 AsmReg r1 = r1_scratch.alloc_specific(AsmReg::R1);
1992    // The call clobbers only the flags, x0, x1, and LR. x0 and x1 are already
1993    // fixed in the scratch registers, so just make sure LR is not in use elsewhere.
1994 if (this->register_file.is_used(Reg{AsmReg::LR})) {
1995 this->evict_reg(Reg{AsmReg::LR});
1996 }
1997
1998 this->text_writer.ensure_space(0x18);
1999 this->reloc_text(
2000 sym, R_AARCH64_TLSDESC_ADR_PAGE21, this->text_writer.offset(), 0);
2001 ASMNC(ADRP, r0, 0, 0);
2002 this->reloc_text(
2003 sym, R_AARCH64_TLSDESC_LD64_LO12, this->text_writer.offset(), 0);
2004 ASMNC(LDRxu, r1, r0, 0);
2005 this->reloc_text(
2006 sym, R_AARCH64_TLSDESC_ADD_LO12, this->text_writer.offset(), 0);
2007 ASMNC(ADDxi, r0, r0, 0);
2008 this->reloc_text(
2009 sym, R_AARCH64_TLSDESC_CALL, this->text_writer.offset(), 0);
2010 ASMNC(BLR, r1);
2011 ASMNC(MRS, r1, 0xde82); // TPIDR_EL0
2012 // TODO: maybe return expr x0+x1.
2013 ASMNC(ADDx, r0, r1, r0);
2014 return r0_scratch;
2015 }
2016 }
2017}
2018
2019} // namespace tpde::a64