TPDE
CompilerA64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "AssemblerElfA64.hpp"
7#include "tpde/CompilerBase.hpp"
8#include "tpde/base.hpp"
9#include "tpde/util/SmallVector.hpp"
10#include "tpde/util/misc.hpp"
11
12#include <bit>
13#include <disarm64.h>
14#include <elf.h>
15
16// Helper macros for assembling in the compiler
17#if defined(ASM) || defined(ASMNC) || defined(ASMC)
18 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
19#endif
20
21/// Encode an instruction with an explicit compiler pointer
22#define ASMC(compiler, op, ...) \
23 ((compiler)->text_writer.write_inst(de64_##op(__VA_ARGS__)))
24/// Encode an instruction using the current compiler (`this`)
25#define ASM(...) ASMC(this, __VA_ARGS__)
26/// Encode an instruction without checking that enough space is available
27#define ASMNC(op, ...) \
28 (this->text_writer.write_inst_unchecked(de64_##op(__VA_ARGS__)))
29/// Try to encode an instruction with an explicit compiler pointer; emits it and returns true only if encoding succeeds
30#define ASMIFC(compiler, op, ...) \
31 ((compiler)->text_writer.try_write_inst(de64_##op(__VA_ARGS__)))
32/// Try to encode an instruction; emits it and returns true only if encoding succeeds
33#define ASMIF(...) ASMIFC(this, __VA_ARGS__)
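// Illustrative usage (sketch, not part of the original interface). Inside a
// compiler member function, with hypothetical registers `dst`, `src` and an
// immediate `imm`:
//   ASM(ADDxi, dst, src, 16);            // checked write through `this`
//   if (!ASMIF(ADDxi, dst, src, imm)) {  // false if the immediate cannot be encoded
//     // materialize `imm` into a register and use the register form instead
//   }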
34
35namespace tpde::a64 {
36
37struct AsmReg : Reg {
38 enum REG : u8 {
39 R0 = 0,
40 R1,
41 R2,
42 R3,
43 R4,
44 R5,
45 R6,
46 R7,
47 R8,
48 R9,
49 R10,
50 R11,
51 R12,
52 R13,
53 R14,
54 R15,
55 R16,
56 R17,
57 R18,
58 R19,
59 R20,
60 R21,
61 R22,
62 R23,
63 R24,
64 R25,
65 R26,
66 R27,
67 R28,
68 R29,
69 FP = 29,
70 R30,
71 LR = 30,
72 SP = 31,
73
74 V0 = 32,
75 V1,
76 V2,
77 V3,
78 V4,
79 V5,
80 V6,
81 V7,
82 V8,
83 V9,
84 V10,
85 V11,
86 V12,
87 V13,
88 V14,
89 V15,
90 V16,
91 V17,
92 V18,
93 V19,
94 V20,
95 V21,
96 V22,
97 V23,
98 V24,
99 V25,
100 V26,
101 V27,
102 V28,
103 V29,
104 V30,
105 V31
106 };
107
108 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
109
110 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
111
112 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
113
114 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
115 assert(id <= SP || (id >= V0 && id <= V31));
116 }
117
118 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
119 assert(id <= SP || (id >= V0 && id <= V31));
120 }
121
122 operator DA_GReg() const noexcept {
123 assert(reg_id < V0);
124 return DA_GReg{reg_id};
125 }
126
127 operator DA_GRegZR() const noexcept {
128 assert(reg_id < V0);
129 assert(reg_id != SP); // 31 means SP in our enums
130 return DA_GRegZR{reg_id};
131 }
132
133 operator DA_GRegSP() const noexcept {
134 assert(reg_id <= SP);
135 return DA_GRegSP{reg_id};
136 }
137
138 operator DA_VReg() const noexcept {
139 assert(reg_id >= V0 && reg_id <= V31);
140 return DA_VReg{static_cast<u8>(reg_id - V0)};
141 }
142};
143
144constexpr static u64
145 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
146 u64 set = 0;
147 for (const auto reg : regs) {
148 set |= 1ull << reg;
149 }
150 return set;
151}
152
153template <size_t N>
154constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
155 u64 set = 0;
156 for (const auto reg : regs) {
157 set |= 1ull << reg.id();
158 }
159 return set;
160}
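// Minimal compile-time sanity check (illustrative addition, follows directly
// from the enum values above): R0 and R1 occupy the two lowest mask bits.
static_assert(create_bitmask({AsmReg::R0, AsmReg::R1}) == 0b11);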
161
162class CCAssignerAAPCS : public CCAssigner {
163 static constexpr CCInfo Info{
164 // we reserve SP, FP, R16, and R17 for our special use cases
165 .allocatable_regs =
166 0xFFFF'FFFF'FFFF'FFFF &
167 ~create_bitmask({AsmReg::SP, AsmReg::FP, AsmReg::R16, AsmReg::R17}),
168 // callee-saved registers
169 .callee_saved_regs = create_bitmask({
170 AsmReg::R19,
171 AsmReg::R20,
172 AsmReg::R21,
173 AsmReg::R22,
174 AsmReg::R23,
175 AsmReg::R24,
176 AsmReg::R25,
177 AsmReg::R26,
178 AsmReg::R27,
179 AsmReg::R28,
180 AsmReg::V8,
181 AsmReg::V9,
182 AsmReg::V10,
183 AsmReg::V11,
184 AsmReg::V12,
185 AsmReg::V13,
186 AsmReg::V14,
187 AsmReg::V15,
188 }),
189 .arg_regs = create_bitmask({
190 AsmReg::R0,
191 AsmReg::R1,
192 AsmReg::R2,
193 AsmReg::R3,
194 AsmReg::R4,
195 AsmReg::R5,
196 AsmReg::R6,
197 AsmReg::R7,
198 AsmReg::R8, // sret register
199 AsmReg::V0,
200 AsmReg::V1,
201 AsmReg::V2,
202 AsmReg::V3,
203 AsmReg::V4,
204 AsmReg::V5,
205 AsmReg::V6,
206 AsmReg::V7,
207 }),
208 };
209
210 // NGRN = Next General-purpose Register Number
211 // NSRN = Next SIMD/FP Register Number
212 // NSAA = Next Stack Argument Address
213 u32 ngrn = 0, nsrn = 0, nsaa = 0;
214 u32 ret_ngrn = 0, ret_nsrn = 0;
215
216public:
217 CCAssignerAAPCS() noexcept : CCAssigner(Info) {}
218
219 void reset() noexcept override {
220 ngrn = nsrn = nsaa = ret_ngrn = ret_nsrn = 0;
221 }
222
223 void assign_arg(CCAssignment &arg) noexcept override {
224 if (arg.byval) [[unlikely]] {
225 nsaa = util::align_up(nsaa, arg.byval_align < 8 ? 8 : arg.byval_align);
226 arg.stack_off = nsaa;
227 nsaa += arg.byval_size;
228 return;
229 }
230
231 if (arg.sret) [[unlikely]] {
232 arg.reg = AsmReg{AsmReg::R8};
233 return;
234 }
235
236 if (arg.bank == RegBank{0}) {
237 if (arg.align > 8) {
238 ngrn = util::align_up(ngrn, 2);
239 }
240 if (ngrn + arg.consecutive < 8) {
241 arg.reg = Reg{AsmReg::R0 + ngrn};
242 ngrn += 1;
243 } else {
244 ngrn = 8;
245 nsaa = util::align_up(nsaa, arg.align < 8 ? 8 : arg.align);
246 arg.stack_off = nsaa;
247 nsaa += 8;
248 }
249 } else {
250 if (nsrn + arg.consecutive < 8) {
251 arg.reg = Reg{AsmReg::V0 + nsrn};
252 nsrn += 1;
253 } else {
254 nsrn = 8;
255 u32 size = util::align_up(arg.size, 8);
256 nsaa = util::align_up(nsaa, size);
257 arg.stack_off = nsaa;
258 nsaa += size;
259 }
260 }
261 }
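  // Worked example (illustrative sketch of the rules above): for a signature
  // like f(i64 a, double b, i64 c), assign_arg hands out
  //   a -> R0 (ngrn 0 -> 1), b -> V0 (nsrn 0 -> 1), c -> R1 (ngrn 1 -> 2).
  // Once ngrn/nsrn reach 8, further arguments of that bank are placed on the
  // stack at the next suitably aligned NSAA offset.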
262
263 u32 get_stack_size() noexcept override { return nsaa; }
264
265 void assign_ret(CCAssignment &arg) noexcept override {
266 assert(!arg.byval && !arg.sret);
267 if (arg.bank == RegBank{0}) {
268 if (arg.align > 8) {
269 ret_ngrn = util::align_up(ret_ngrn, 2);
270 }
271 if (ret_ngrn + arg.consecutive < 8) {
272 arg.reg = Reg{AsmReg::R0 + ret_ngrn};
273 ret_ngrn += 1;
274 } else {
275 assert(false);
276 }
277 } else {
278 if (ret_nsrn + arg.consecutive < 8) {
279 arg.reg = Reg{AsmReg::V0 + ret_nsrn};
280 ret_nsrn += 1;
281 } else {
282 assert(false);
283 }
284 }
285 }
286};
287
288struct PlatformConfig : CompilerConfigDefault {
289 using Assembler = AssemblerElfA64;
290 using AsmReg = tpde::a64::AsmReg;
291 using DefaultCCAssigner = CCAssignerAAPCS;
292
293 static constexpr RegBank GP_BANK{0};
294 static constexpr RegBank FP_BANK{1};
295 static constexpr bool FRAME_INDEXING_NEGATIVE = false;
296 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
297 static constexpr u32 NUM_BANKS = 2;
298};
299
300namespace concepts {
301template <typename T, typename Config>
302concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
303 {
304 a.arg_is_int128(std::declval<typename T::IRValueRef>())
305 } -> std::convertible_to<bool>;
306
307 {
308 a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
309 } -> std::convertible_to<bool>;
310};
311} // namespace concepts
312
313template <IRAdaptor Adaptor,
314 typename Derived,
315 template <typename, typename, typename> typename BaseTy =
316 CompilerBase,
317 typename Config = PlatformConfig>
318struct CompilerA64 : BaseTy<Adaptor, Derived, Config> {
319 using Base = BaseTy<Adaptor, Derived, Config>;
320
321 using IRValueRef = typename Base::IRValueRef;
322 using IRBlockRef = typename Base::IRBlockRef;
323 using IRFuncRef = typename Base::IRFuncRef;
324
325 using ScratchReg = typename Base::ScratchReg;
326 using ValuePartRef = typename Base::ValuePartRef;
327 using ValuePart = typename Base::ValuePart;
328 using GenericValuePart = typename Base::GenericValuePart;
329
330 using Assembler = typename PlatformConfig::Assembler;
331 using RegisterFile = typename Base::RegisterFile;
332
333 using CallArg = typename Base::CallArg;
334
335 using Base::derived;
336
337
338 // TODO(ts): make this dependent on the number of callee-saved regs of the
339 // current function or if there is a call in the function?
340 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
341 6};
342
343 enum CPU_FEATURES : u32 {
344 CPU_BASELINE = 0, // ARMv8.0
345 };
346
347 CPU_FEATURES cpu_feats = CPU_BASELINE;
348
349 // When handling function arguments, we need to prevent argument registers
350 // from being handed out as fixed registers
351 //
352 // Additionally, we prevent R0 and R1 from being fixed assignments to
353 // prevent issues with exception handling
354 u64 fixed_assignment_nonallocatable_mask =
355 create_bitmask({AsmReg::R0, AsmReg::R1});
356 u32 func_start_off = 0u, func_prologue_alloc = 0u, func_epilogue_alloc = 0u;
357 /// Offset to the `add sp, sp, XXX` instruction that the argument handling
358 /// uses to access stack arguments if needed
359 u32 func_arg_stack_add_off = ~0u;
360 AsmReg func_arg_stack_add_reg = AsmReg::make_invalid();
361
362 /// Permanent scratch register, e.g. to materialize constants/offsets. This is
363 /// used by materialize_constant, load_from_stack, spill_reg.
364 AsmReg permanent_scratch_reg = AsmReg::R16;
365
366 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
367 u32 reg_save_frame_off = 0;
368 util::SmallVector<u32, 8> func_ret_offs = {};
369
370 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
371 u32 stack_adjust_off = 0;
372 u32 stack_size = 0;
373 u32 stack_sub = 0;
374
375 void set_stack_used() noexcept;
376
377 public:
378 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
379 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
380
381 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
382 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
383 void call_impl(
384 std::variant<typename Assembler::SymRef, ValuePart> &&) noexcept;
385 void reset_stack() noexcept;
386 };
387
388 // for now, always generate an object
389 explicit CompilerA64(Adaptor *adaptor,
390 const CPU_FEATURES cpu_features = CPU_BASELINE)
391 : Base{adaptor}, cpu_feats(cpu_features) {
392 static_assert(std::is_base_of_v<CompilerA64, Derived>);
393 static_assert(concepts::Compiler<Derived, PlatformConfig>);
394 }
395
396 void start_func(u32 func_idx) noexcept;
397
398 void gen_func_prolog_and_args(CCAssigner *cc_assigner) noexcept;
399
400 // note: this has to call assembler->end_func
401 void finish_func(u32 func_idx) noexcept;
402
403 void reset() noexcept;
404
405 // helpers
406
407 void gen_func_epilog() noexcept;
408
409 void
410 spill_reg(const AsmReg reg, const u32 frame_off, const u32 size) noexcept;
411
412 void load_from_stack(AsmReg dst,
413 i32 frame_off,
414 u32 size,
415 bool sign_extend = false) noexcept;
416
417 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
418
419 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
420
421 GenericValuePart val_spill_slot(ValuePart &val_ref) noexcept {
422 const auto ap = val_ref.assignment();
423 assert(ap.stack_valid() && !ap.variable_ref());
424 return typename GenericValuePart::Expr(AsmReg::R29, ap.frame_off());
425 }
426
427 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
428
429 void materialize_constant(const u64 *data,
430 RegBank bank,
431 u32 size,
432 AsmReg dst) noexcept;
433 void materialize_constant(u64 const_u64,
434 RegBank bank,
435 u32 size,
436 AsmReg dst) noexcept {
437 assert(size <= sizeof(const_u64));
438 materialize_constant(&const_u64, bank, size, dst);
439 }
440
441 AsmReg select_fixed_assignment_reg(RegBank bank, IRValueRef) noexcept;
442
443 struct Jump {
444 enum Kind : uint8_t {
445 Jeq,
446 Jne,
447 Jcs,
448 Jhs = Jcs,
449 Jcc,
450 Jlo = Jcc,
451 Jmi,
452 Jpl,
453 Jvs,
454 Jvc,
455 Jhi,
456 Jls,
457 Jge,
458 Jlt,
459 Jgt,
460 Jle,
461 // TODO: consistency
462 jmp,
463 Cbz,
464 Cbnz,
465 Tbz,
466 Tbnz
467 };
468
469 Kind kind;
470 AsmReg cmp_reg;
471 bool cmp_is_32;
472 u8 test_bit;
473
474 constexpr Jump() : kind(Kind::jmp) {}
475
476 constexpr Jump(Kind kind) : kind(kind), cmp_is_32(false), test_bit(0) {
477 assert(kind != Cbz && kind != Cbnz && kind != Tbz && kind != Tbnz);
478 }
479
480 constexpr Jump(Kind kind, AsmReg cmp_reg, bool cmp_is_32)
481 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(cmp_is_32), test_bit(0) {
482 assert(kind == Cbz || kind == Cbnz);
483 }
484
485 constexpr Jump(Kind kind, AsmReg cmp_reg, u8 test_bit)
486 : kind(kind), cmp_reg(cmp_reg), cmp_is_32(false), test_bit(test_bit) {
487 assert(kind == Tbz || kind == Tbnz);
488 }
489
490 constexpr Jump change_kind(Kind new_kind) const {
491 auto cpy = *this;
492 cpy.kind = new_kind;
493 return cpy;
494 }
495 };
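  // Construction sketch (illustrative; `reg` is a hypothetical AsmReg holding
  // the value to be tested):
  //   Jump{Jump::Jne}                            // conditional branch on flags (b.ne)
  //   Jump{Jump::Cbz, reg, /*cmp_is_32=*/true}   // cbz w<reg>, <target>
  //   Jump{Jump::Tbnz, reg, /*test_bit=*/u8{7}}  // tbnz x<reg>, #7, <target>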
496
497 Jump invert_jump(Jump jmp) noexcept;
498 Jump swap_jump(Jump jmp) noexcept;
499
500 void generate_branch_to_block(Jump jmp,
501 IRBlockRef target,
502 bool needs_split,
503 bool last_inst) noexcept;
504
505 void generate_raw_jump(Jump jmp, Assembler::Label target) noexcept;
506
507 void generate_raw_set(Jump jmp, AsmReg dst) noexcept;
508 void generate_raw_mask(Jump jmp, AsmReg dst) noexcept;
509
510 void generate_raw_intext(
511 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
512
513 /// Generate a function call
514 ///
515 /// This will get the arguments into the correct registers according to the
516 /// calling convention, clear non-callee-saved registers from the register
517 /// file (make sure you do not have any fixed assignments left over) and
518 /// fill the result registers (the u8 in the ScratchReg pair indicates the
519 /// register bank)
520 ///
521 /// The target can be a symbol (a call to the PLT with a relocation) or an
522 /// indirect call through a ValuePart. The result reference is optional.
523 void generate_call(std::variant<Assembler::SymRef, ValuePart> &&target,
524 std::span<CallArg> arguments,
525 typename Base::ValueRef *result,
526 bool variable_args = false);
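  // Possible call-site sketch (assumes a symbol `fn_sym`, an argument span
  // `args` and a result ValueRef `res` prepared by the derived compiler; none
  // of these names are part of this header):
  //   derived()->generate_call(fn_sym, args, &res);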
527
528 /// Generate code sequence to load address of sym into a register. This will
529 /// generate a function call for dynamic TLS access models.
530 ScratchReg tls_get_addr(Assembler::SymRef sym, TLSModel model) noexcept;
531
532 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
533 return ((cpu_feats & feats) == feats);
534 }
535};
536
537template <IRAdaptor Adaptor,
538 typename Derived,
539 template <typename, typename, typename> class BaseTy,
540 typename Config>
541void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
542 set_stack_used() noexcept {
543 if (stack_adjust_off == 0) {
544 this->compiler.text_writer.ensure_space(16);
545 stack_adjust_off = this->compiler.text_writer.offset();
546 this->compiler.text_writer.cur_ptr() += 4;
547 }
548}
549
550template <IRAdaptor Adaptor,
551 typename Derived,
552 template <typename, typename, typename> class BaseTy,
553 typename Config>
554void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
555 ValuePart &vp, CCAssignment &cca) noexcept {
556 AsmReg ptr_reg = vp.load_to_reg(&this->compiler);
557 AsmReg tmp_reg = AsmReg::R16;
558
559 auto size = cca.byval_size;
560 set_stack_used();
561 for (u32 off = 0; off < size;) {
562 if (size - off >= 8) {
563 ASMC(&this->compiler, LDRxu, tmp_reg, ptr_reg, off);
564 ASMC(&this->compiler, STRxu, tmp_reg, DA_SP, cca.stack_off + off);
565 off += 8;
566 } else if (size - off >= 4) {
567 ASMC(&this->compiler, LDRwu, tmp_reg, ptr_reg, off);
568 ASMC(&this->compiler, STRwu, tmp_reg, DA_SP, cca.stack_off + off);
569 off += 4;
570 } else if (size - off >= 2) {
571 ASMC(&this->compiler, LDRHu, tmp_reg, ptr_reg, off);
572 ASMC(&this->compiler, STRHu, tmp_reg, DA_SP, cca.stack_off + off);
573 off += 2;
574 } else {
575 ASMC(&this->compiler, LDRBu, tmp_reg, ptr_reg, off);
576 ASMC(&this->compiler, STRBu, tmp_reg, DA_SP, cca.stack_off + off);
577 off += 1;
578 }
579 }
580}
581
582template <IRAdaptor Adaptor,
583 typename Derived,
584 template <typename, typename, typename> class BaseTy,
585 typename Config>
586void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
587 ValuePart &vp, CCAssignment &cca) noexcept {
588 set_stack_used();
589
590 auto reg = vp.load_to_reg(&this->compiler);
591 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
592 switch (cca.size) {
593 case 1: ASMC(&this->compiler, STRBu, reg, DA_SP, cca.stack_off); break;
594 case 2: ASMC(&this->compiler, STRHu, reg, DA_SP, cca.stack_off); break;
595 case 4: ASMC(&this->compiler, STRwu, reg, DA_SP, cca.stack_off); break;
596 case 8: ASMC(&this->compiler, STRxu, reg, DA_SP, cca.stack_off); break;
597 default: TPDE_UNREACHABLE("invalid GP reg size");
598 }
599 } else {
600 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
601 switch (cca.size) {
602 case 1: ASMC(&this->compiler, STRbu, reg, DA_SP, cca.stack_off); break;
603 case 2: ASMC(&this->compiler, STRhu, reg, DA_SP, cca.stack_off); break;
604 case 4: ASMC(&this->compiler, STRsu, reg, DA_SP, cca.stack_off); break;
605 case 8: ASMC(&this->compiler, STRdu, reg, DA_SP, cca.stack_off); break;
606 case 16: ASMC(&this->compiler, STRqu, reg, DA_SP, cca.stack_off); break;
607 default: TPDE_UNREACHABLE("invalid FP reg size");
608 }
609 }
610}
611
612template <IRAdaptor Adaptor,
613 typename Derived,
614 template <typename, typename, typename> class BaseTy,
615 typename Config>
616void CompilerA64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
617 std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept {
618 u32 sub = 0;
619 if (stack_adjust_off != 0) {
620 auto *text_data = this->compiler.text_writer.begin_ptr();
621 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + stack_adjust_off);
622 u32 stack_size = this->assigner.get_stack_size();
623 sub = util::align_up(stack_size, stack_size < 0x1000 ? 0x10 : 0x1000);
624 *write_ptr = de64_SUBxi(DA_SP, DA_SP, sub);
625 } else {
626 assert(this->assigner.get_stack_size() == 0);
627 }
628
629
630 if (auto *sym = std::get_if<typename Assembler::SymRef>(&target)) {
631 ASMC(&this->compiler, BL, 0);
632 this->compiler.reloc_text(
633 *sym, R_AARCH64_CALL26, this->compiler.text_writer.offset() - 4);
634 } else {
635 ValuePart &tvp = std::get<ValuePart>(target);
636 AsmReg reg = tvp.cur_reg_unlocked();
637 if (!reg.valid()) {
638 reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R16);
639 }
640 ASMC(&this->compiler, BLR, reg);
641 tvp.reset(&this->compiler);
642 }
643
644 if (stack_adjust_off != 0) {
645 ASMC(&this->compiler, ADDxi, DA_SP, DA_SP, sub);
646 }
647}
648
649template <IRAdaptor Adaptor,
650 typename Derived,
651 template <typename, typename, typename> class BaseTy,
652 typename Config>
653void CompilerA64<Adaptor, Derived, BaseTy, Config>::start_func(
654 const u32 /*func_idx*/) noexcept {
655 this->assembler.except_begin_func();
656 this->text_writer.align(16);
657}
658
659template <IRAdaptor Adaptor,
660 typename Derived,
661 template <typename, typename, typename> typename BaseTy,
662 typename Config>
663void CompilerA64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
664 CCAssigner *cc_assigner) noexcept {
665 // prologue:
666 // sub sp, sp, #<frame_size>
667 // stp x29, x30, [sp]
668 // mov x29, sp
669 // optionally create vararg save-area
670 // reserve space for callee-saved regs
671 // 4 bytes per callee-saved reg pair since for each we do
672 // stp r1, r2, [sp + XX]
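  // For example (illustrative), a function that ends up saving x19, x20 and d8
  // would get a prologue roughly like:
  //   sub sp, sp, #<frame_size>
  //   stp x29, x30, [sp]
  //   mov x29, sp
  //   stp x19, x20, [sp, #16]
  //   str d8, [sp, #32]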
673
674 // TODO(ts): for smaller functions we could enable an optimization
675 // to store the saved regs after the local variables,
676 // which would let us skip allocating space for regs we never save
677 // and could help in the common case.
678 // However, we need to commit to this at the beginning of the function,
679 // as otherwise stack accesses need to skip the reg-save area.
680
681 func_ret_offs.clear();
682 func_start_off = this->text_writer.offset();
683
684 const CCInfo &cc_info = cc_assigner->get_ccinfo();
685
686 // We don't actually generate the prologue here and merely allocate space
687 // for it. Right now, we don't know which callee-saved registers will be
688 // used. Instead of padding with nops, we later move the beginning of the
689 // function so that small functions don't have to execute up to 9 nops.
690 // See finish_func.
691 this->stack.frame_size = 16; // FP, LR
692 {
693 auto csr = cc_info.callee_saved_regs;
694 auto csr_gp = csr & this->register_file.bank_regs(Config::GP_BANK);
695 auto csr_fp = csr & this->register_file.bank_regs(Config::FP_BANK);
696 u32 gp_saves = std::popcount(csr_gp);
697 u32 fp_saves = std::popcount(csr_fp);
698 // LDP/STP can handle two registers of the same bank.
699 u32 reg_save_size = 4 * ((gp_saves + 1) / 2 + (fp_saves + 1) / 2);
700 // TODO: support CSR of Qx/Vx registers, not just Dx
701 this->stack.frame_size += util::align_up(gp_saves * 8 + fp_saves * 8, 16);
702
703 // Reserve space for sub sp, stp x29/x30, and mov x29, sp.
704 func_prologue_alloc = reg_save_size + 12;
705 this->text_writer.ensure_space(func_prologue_alloc);
706 this->text_writer.cur_ptr() += func_prologue_alloc;
707 // ldp needs the same number of instructions as stp
708 // additionally, there's an add sp, ldp x29/x30, ret (+12)
709 func_epilogue_alloc = reg_save_size + 12;
710 // extra mov sp, fp
711 func_epilogue_alloc += this->adaptor->cur_has_dynamic_alloca() ? 4 : 0;
712 }
713
714 // TODO(ts): support larger stack alignments?
715
716 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
717 reg_save_frame_off = this->stack.frame_size;
718 // We additionally store a pointer to the stack area, which we can't compute
719 // with a constant offset from the frame pointer. Add 16 bytes to maintain
720 // alignment.
721 this->stack.frame_size += 8 * 8 + 8 * 16 + 16;
722 this->text_writer.ensure_space(4 * 8);
723 ASMNC(STPx, DA_GP(0), DA_GP(1), DA_SP, reg_save_frame_off);
724 ASMNC(STPx, DA_GP(2), DA_GP(3), DA_SP, reg_save_frame_off + 16);
725 ASMNC(STPx, DA_GP(4), DA_GP(5), DA_SP, reg_save_frame_off + 32);
726 ASMNC(STPx, DA_GP(6), DA_GP(7), DA_SP, reg_save_frame_off + 48);
727 ASMNC(STPq, DA_V(0), DA_V(1), DA_SP, reg_save_frame_off + 64);
728 ASMNC(STPq, DA_V(2), DA_V(3), DA_SP, reg_save_frame_off + 96);
729 ASMNC(STPq, DA_V(4), DA_V(5), DA_SP, reg_save_frame_off + 128);
730 ASMNC(STPq, DA_V(6), DA_V(7), DA_SP, reg_save_frame_off + 160);
731 }
732
733 // Temporarily prevent argument registers from being assigned.
734 assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
735 "argument registers must also be allocatable");
736 this->register_file.allocatable &= ~cc_info.arg_regs;
737
738 this->func_arg_stack_add_off = ~0u;
739
740 u32 arg_idx = 0;
741 for (const IRValueRef arg : this->adaptor->cur_args()) {
742 derived()->handle_func_arg(
743 arg_idx, arg, [&](ValuePart &&vp, CCAssignment cca) {
744 cca.bank = vp.bank();
745 cca.size = vp.part_size();
746
747 cc_assigner->assign_arg(cca);
748
749 if (cca.reg.valid()) [[likely]] {
750 vp.set_value_reg(this, cca.reg);
751 // Mark register as allocatable as soon as it is assigned. If the
752 // argument is unused, the register will be freed immediately and
753 // can be used for later stack arguments.
754 this->register_file.allocatable |= u64{1} << cca.reg.id();
755 return;
756 }
757
758 this->text_writer.ensure_space(8);
759 AsmReg stack_reg = AsmReg::R17;
760 // TODO: allocate an actual scratch register for this.
761 assert(
762 !(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
763 "x17 must not be allocatable");
764 if (this->func_arg_stack_add_off == ~0u) {
765 this->func_arg_stack_add_off = this->text_writer.offset();
766 this->func_arg_stack_add_reg = stack_reg;
767 // Fixed in finish_func when frame size is known
768 ASMNC(ADDxi, stack_reg, DA_SP, 0);
769 }
770
771 AsmReg dst = vp.alloc_reg(this);
772 if (cca.byval) {
773 ASM(ADDxi, dst, stack_reg, cca.stack_off);
774 } else if (cca.bank == Config::GP_BANK) {
775 switch (cca.size) {
776 case 1: ASMNC(LDRBu, dst, stack_reg, cca.stack_off); break;
777 case 2: ASMNC(LDRHu, dst, stack_reg, cca.stack_off); break;
778 case 4: ASMNC(LDRwu, dst, stack_reg, cca.stack_off); break;
779 case 8: ASMNC(LDRxu, dst, stack_reg, cca.stack_off); break;
780 default: TPDE_UNREACHABLE("invalid GP reg size");
781 }
782 } else {
783 assert(cca.bank == Config::FP_BANK);
784 switch (cca.size) {
785 case 1: ASMNC(LDRbu, dst, stack_reg, cca.stack_off); break;
786 case 2: ASMNC(LDRhu, dst, stack_reg, cca.stack_off); break;
787 case 4: ASMNC(LDRsu, dst, stack_reg, cca.stack_off); break;
788 case 8: ASMNC(LDRdu, dst, stack_reg, cca.stack_off); break;
789 case 16: ASMNC(LDRqu, dst, stack_reg, cca.stack_off); break;
790 default: TPDE_UNREACHABLE("invalid FP reg size");
791 }
792 }
793 });
794
795 arg_idx += 1;
796 }
797
798 // Hack: we don't know the frame size, so for a va_start(), we cannot easily
799 // compute the offset from the frame pointer. But we have a stack_reg here,
800 // so use it for var args.
801 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
802 AsmReg stack_reg = AsmReg::R17;
803 // TODO: allocate an actual scratch register for this.
804 assert(!(this->register_file.allocatable & (u64{1} << stack_reg.id())) &&
805 "x17 must not be allocatable");
806 if (this->func_arg_stack_add_off == ~0u) {
807 this->func_arg_stack_add_off = this->text_writer.offset();
808 this->func_arg_stack_add_reg = stack_reg;
809 // Fixed in finish_func when frame size is known
810 ASMC(this, ADDxi, stack_reg, DA_SP, 0);
811 }
812 ASM(ADDxi, stack_reg, stack_reg, cc_assigner->get_stack_size());
813 ASM(STRxu, stack_reg, DA_GP(29), this->reg_save_frame_off + 192);
814
815 // TODO: extract ngrn/nsrn from CCAssigner
816 // TODO: this isn't quite accurate, e.g. for (i128, i128, i128, i64, i128),
817 // this should be 8 but will end up with 7.
818 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
819 u32 ngrn = 8 - util::cnt_lz<u16>((arg_regs & 0xff) << 8 | 0x80);
820 u32 nsrn = 8 - util::cnt_lz<u16>(((arg_regs >> 32) & 0xff) << 8 | 0x80);
821 this->scalar_arg_count = ngrn;
822 this->vec_arg_count = nsrn;
823 }
824
825 this->register_file.allocatable |= cc_info.arg_regs;
826}
827
828template <IRAdaptor Adaptor,
829 typename Derived,
830 template <typename, typename, typename> typename BaseTy,
831 typename Config>
832void CompilerA64<Adaptor, Derived, BaseTy, Config>::finish_func(
833 u32 func_idx) noexcept {
834 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
835 u64 saved_regs = this->register_file.clobbered & csr;
836
837 const auto dyn_alloca = this->adaptor->cur_has_dynamic_alloca();
838 auto stack_reg = DA_SP;
839 if (dyn_alloca) {
840 stack_reg = DA_GP(29);
841 }
842
843 auto final_frame_size = util::align_up(this->stack.frame_size, 16);
844 if (final_frame_size > 4095) {
845 // round up to 4k since SUB cannot encode immediates greater than 4095
846 final_frame_size = util::align_up(final_frame_size, 4096);
847 assert(final_frame_size < 16 * 1024 * 1024);
848 }
849
850 auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
851
852 {
853 // NB: code alignment factor 4, data alignment factor -8.
854 util::SmallVector<u32, 16> prologue;
855 prologue.push_back(de64_SUBxi(DA_SP, DA_SP, final_frame_size));
856 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
857 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset,
858 final_frame_size);
859 prologue.push_back(de64_STPx(DA_GP(29), DA_GP(30), DA_SP, 0));
860 prologue.push_back(de64_MOV_SPx(DA_GP(29), DA_SP));
861 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 2);
862 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
863 dwarf::a64::DW_reg_fp);
864 this->assembler.eh_write_inst(
865 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_fp, final_frame_size / 8);
866 this->assembler.eh_write_inst(
867 dwarf::DW_CFA_offset, dwarf::a64::DW_reg_lr, final_frame_size / 8 - 1);
868
869 // Patched below
870 auto fde_prologue_adv_off = this->assembler.eh_writer.size();
871 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
872
873 AsmReg last_reg = AsmReg::make_invalid();
874 u32 frame_off = 16;
875 for (auto reg : util::BitSetIterator{saved_regs}) {
876 if (last_reg.valid()) {
877 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
878 const auto last_bank = this->register_file.reg_bank(last_reg);
879 if (reg_bank == last_bank) {
880 if (reg_bank == Config::GP_BANK) {
881 prologue.push_back(
882 de64_STPx(last_reg, AsmReg{reg}, stack_reg, frame_off));
883 } else {
884 prologue.push_back(
885 de64_STPd(last_reg, AsmReg{reg}, stack_reg, frame_off));
886 }
887 frame_off += 16;
888 last_reg = AsmReg::make_invalid();
889 } else {
890 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
891 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
892 frame_off += 8;
893 last_reg = AsmReg{reg};
894 }
895 continue;
896 }
897
898 u8 dwarf_base = reg < 32 ? dwarf::a64::DW_reg_x0 : dwarf::a64::DW_reg_v0;
899 u8 dwarf_reg = dwarf_base + reg % 32;
900 u32 cfa_off = (final_frame_size - frame_off) / 8;
901 if ((dwarf_reg & dwarf::DWARF_CFI_PRIMARY_OPCODE_MASK) == 0) {
902 this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
903 } else {
904 this->assembler.eh_write_inst(
905 dwarf::DW_CFA_offset_extended, dwarf_reg, cfa_off);
906 }
907
908 last_reg = AsmReg{reg};
909 }
910
911 if (last_reg.valid()) {
912 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
913 prologue.push_back(de64_STRxu(last_reg, stack_reg, frame_off));
914 } else {
915 assert(this->register_file.reg_bank(last_reg) == Config::FP_BANK);
916 prologue.push_back(de64_STRdu(last_reg, stack_reg, frame_off));
917 }
918 }
919
920 assert(prologue.size() * sizeof(u32) <= func_prologue_alloc);
921
922 assert(prologue.size() < 0x4c);
923 this->assembler.eh_writer.data()[fde_prologue_adv_off] =
924 dwarf::DW_CFA_advance_loc | (prologue.size() - 3);
925
926 // Pad with NOPs so that func_prologue_alloc - prologue.size() * 4 is a
927 // multiple of 16 (the function alignment).
928 const auto nop_count = (func_prologue_alloc / 4 - prologue.size()) % 4;
929 const auto nop = de64_NOP();
930 for (auto i = 0u; i < nop_count; ++i) {
931 prologue.push_back(nop);
932 }
933
934 // Shrink function at the beginning
935 u32 skip = util::align_down(func_prologue_alloc - prologue.size() * 4, 16);
936 std::memset(this->text_writer.begin_ptr() + func_start_off, 0, skip);
937 func_start_off += skip;
938 this->assembler.sym_set_value(this->func_syms[func_idx], func_start_off);
939 std::memcpy(this->text_writer.begin_ptr() + func_start_off,
940 prologue.data(),
941 prologue.size() * sizeof(u32));
942 }
943
944 if (func_arg_stack_add_off != ~0u) {
945 auto *inst_ptr = this->text_writer.begin_ptr() + func_arg_stack_add_off;
946 *reinterpret_cast<u32 *>(inst_ptr) =
947 de64_ADDxi(func_arg_stack_add_reg, DA_SP, final_frame_size);
948 }
949
950 // TODO(ts): honor cur_needs_unwind_info
951 auto func_sym = this->func_syms[func_idx];
952 auto func_sec = this->text_writer.get_sec_ref();
953
954 if (func_ret_offs.empty()) {
955 auto func_size = this->text_writer.offset() - func_start_off;
956 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
957 this->assembler.eh_end_fde(fde_off, func_sym);
958 this->assembler.except_encode_func(func_sym);
959 return;
960 }
961
962 auto *text_data = this->text_writer.begin_ptr();
963 u32 first_ret_off = func_ret_offs[0];
964 u32 ret_size = 0;
965 {
966 u32 *write_ptr = reinterpret_cast<u32 *>(text_data + first_ret_off);
967 const auto ret_start = write_ptr;
968 if (dyn_alloca) {
969 *write_ptr++ = de64_MOV_SPx(DA_SP, DA_GP(29));
970 } else {
971 *write_ptr++ = de64_LDPx(DA_GP(29), DA_GP(30), DA_SP, 0);
972 }
973
974 AsmReg last_reg = AsmReg::make_invalid();
975 u32 frame_off = 16;
976 for (auto reg : util::BitSetIterator{saved_regs}) {
977 if (last_reg.valid()) {
978 const auto reg_bank = this->register_file.reg_bank(AsmReg{reg});
979 const auto last_bank = this->register_file.reg_bank(last_reg);
980 if (reg_bank == last_bank) {
981 if (reg_bank == Config::GP_BANK) {
982 *write_ptr++ =
983 de64_LDPx(last_reg, AsmReg{reg}, stack_reg, frame_off);
984 } else {
985 *write_ptr++ =
986 de64_LDPd(last_reg, AsmReg{reg}, stack_reg, frame_off);
987 }
988 frame_off += 16;
989 last_reg = AsmReg::make_invalid();
990 } else {
991 assert(last_bank == Config::GP_BANK && reg_bank == Config::FP_BANK);
992 *write_ptr++ = de64_LDRxu(last_reg, stack_reg, frame_off);
993 frame_off += 8;
994 last_reg = AsmReg{reg};
995 }
996 continue;
997 }
998
999 last_reg = AsmReg{reg};
1000 }
1001
1002 if (last_reg.valid()) {
1003 if (this->register_file.reg_bank(last_reg) == Config::GP_BANK) {
1004 *write_ptr++ = de64_LDRxu(last_reg, stack_reg, frame_off);
1005 } else {
1006 *write_ptr++ = de64_LDRdu(last_reg, stack_reg, frame_off);
1007 }
1008 }
1009
1010 if (dyn_alloca) {
1011 *write_ptr++ = de64_LDPx(DA_GP(29), DA_GP(30), DA_SP, 0);
1012 }
1013
1014 *write_ptr++ = de64_ADDxi(DA_SP, DA_SP, final_frame_size);
1015 *write_ptr++ = de64_RET(DA_GP(30));
1016
1017 ret_size = (write_ptr - ret_start) * 4;
1018 assert(ret_size <= func_epilogue_alloc);
1019 std::memset(write_ptr, 0, func_epilogue_alloc - ret_size);
1020 }
1021
1022 for (u32 i = 1; i < func_ret_offs.size(); ++i) {
1023 std::memcpy(text_data + func_ret_offs[i],
1024 text_data + first_ret_off,
1025 func_epilogue_alloc);
1026 }
1027
1028 u32 func_end_ret_off = this->text_writer.offset() - func_epilogue_alloc;
1029 if (func_ret_offs.back() == func_end_ret_off) {
1030 this->text_writer.cur_ptr() -= func_epilogue_alloc - ret_size;
1031 }
1032
1033 auto func_size = this->text_writer.offset() - func_start_off;
1034 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
1035 this->assembler.eh_end_fde(fde_off, func_sym);
1036 this->assembler.except_encode_func(func_sym);
1037}
1038
1039template <IRAdaptor Adaptor,
1040 typename Derived,
1041 template <typename, typename, typename> typename BaseTy,
1042 typename Config>
1043void CompilerA64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
1044 func_ret_offs.clear();
1045 Base::reset();
1046}
1047
1048template <IRAdaptor Adaptor,
1049 typename Derived,
1050 template <typename, typename, typename> typename BaseTy,
1051 typename Config>
1052void CompilerA64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
1053 // epilogue:
1054 // if !func_has_dynamic_alloca:
1055 // ldp x29, x30, [sp]
1056 // else:
1057 // mov sp, fp
1058 // for each saved reg pair:
1059 // if func_has_dynamic_alloca:
1060 // ldp r1, r2, [fp, #<off>]
1061 // else:
1062 // ldp r1, r2, [sp, #<off>]
1063 // if func_has_dynamic_alloca:
1064 // ldp x29, x30, [sp]
1065 // add sp, sp, #<frame_size>
1066 // ret
1067 //
1068 // however, since we will later patch this, we only
1069 // reserve the space for now
1070
1071 func_ret_offs.push_back(this->text_writer.offset());
1072 this->text_writer.ensure_space(func_epilogue_alloc);
1073 this->text_writer.cur_ptr() += func_epilogue_alloc;
1074}
1075
1076template <IRAdaptor Adaptor,
1077 typename Derived,
1078 template <typename, typename, typename> typename BaseTy,
1079 typename Config>
1080void CompilerA64<Adaptor, Derived, BaseTy, Config>::spill_reg(
1081 const AsmReg reg, const u32 frame_off, const u32 size) noexcept {
1082 assert((size & (size - 1)) == 0);
1083 assert(util::align_up(frame_off, size) == frame_off);
1084 // We don't support stack frames that aren't encodeable with add/sub.
1085 assert(frame_off < 0x1'000'000);
1086
1087 u32 off = frame_off;
1088 auto addr_base = AsmReg{AsmReg::FP};
1089 if (off >= 0x1000 * size) [[unlikely]] {
1090 // We cannot encode the offset in the store instruction.
1091 ASM(ADDxi, permanent_scratch_reg, DA_GP(29), off & ~0xfff);
1092 off &= 0xfff;
1093 addr_base = permanent_scratch_reg;
1094 }
1095
1096 this->text_writer.ensure_space(4);
1097 assert(-static_cast<i32>(frame_off) < 0);
1098 if (reg.id() <= AsmReg::R30) {
1099 switch (size) {
1100 case 1: ASMNC(STRBu, reg, addr_base, off); break;
1101 case 2: ASMNC(STRHu, reg, addr_base, off); break;
1102 case 4: ASMNC(STRwu, reg, addr_base, off); break;
1103 case 8: ASMNC(STRxu, reg, addr_base, off); break;
1104 default: TPDE_UNREACHABLE("invalid register spill size");
1105 }
1106 } else {
1107 switch (size) {
1108 case 1: ASMNC(STRbu, reg, addr_base, off); break;
1109 case 2: ASMNC(STRhu, reg, addr_base, off); break;
1110 case 4: ASMNC(STRsu, reg, addr_base, off); break;
1111 case 8: ASMNC(STRdu, reg, addr_base, off); break;
1112 case 16: ASMNC(STRqu, reg, addr_base, off); break;
1113 default: TPDE_UNREACHABLE("invalid register spill size");
1114 }
1115 }
1116}
1117
1118template <IRAdaptor Adaptor,
1119 typename Derived,
1120 template <typename, typename, typename> typename BaseTy,
1121 typename Config>
1122void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
1123 const AsmReg dst,
1124 const i32 frame_off,
1125 const u32 size,
1126 const bool sign_extend) noexcept {
1127 assert((size & (size - 1)) == 0);
1128 assert(util::align_up(frame_off, size) == frame_off);
1129 // We don't support stack frames that aren't encodeable with add/sub.
1130 assert(frame_off >= 0 && frame_off < 0x1'000'000);
1131
1132 u32 off = frame_off;
1133 auto addr_base = AsmReg{AsmReg::FP};
1134 if (off >= 0x1000 * size) [[unlikely]] {
1135 // need to calculate this explicitly
1136 addr_base = dst.id() <= AsmReg::R30 ? dst : permanent_scratch_reg;
1137 ASM(ADDxi, addr_base, DA_GP(29), off & ~0xfff);
1138 off &= 0xfff;
1139 }
1140
1141 this->text_writer.ensure_space(4);
1142 if (dst.id() <= AsmReg::R30) {
1143 if (!sign_extend) {
1144 switch (size) {
1145 case 1: ASMNC(LDRBu, dst, addr_base, off); break;
1146 case 2: ASMNC(LDRHu, dst, addr_base, off); break;
1147 case 4: ASMNC(LDRwu, dst, addr_base, off); break;
1148 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1149 default: TPDE_UNREACHABLE("invalid register spill size");
1150 }
1151 } else {
1152 switch (size) {
1153 case 1: ASMNC(LDRSBwu, dst, addr_base, off); break;
1154 case 2: ASMNC(LDRSHwu, dst, addr_base, off); break;
1155 case 4: ASMNC(LDRSWxu, dst, addr_base, off); break;
1156 case 8: ASMNC(LDRxu, dst, addr_base, off); break;
1157 default: TPDE_UNREACHABLE("invalid register spill size");
1158 }
1159 }
1160 return;
1161 }
1162
1163 assert(!sign_extend);
1164
1165 switch (size) {
1166 case 1: ASMNC(LDRbu, dst, addr_base, off); break;
1167 case 2: ASMNC(LDRhu, dst, addr_base, off); break;
1168 case 4: ASMNC(LDRsu, dst, addr_base, off); break;
1169 case 8: ASMNC(LDRdu, dst, addr_base, off); break;
1170 case 16: ASMNC(LDRqu, dst, addr_base, off); break;
1171 default: TPDE_UNREACHABLE("invalid register spill size");
1172 }
1173}
1174
1175template <IRAdaptor Adaptor,
1176 typename Derived,
1177 template <typename, typename, typename> typename BaseTy,
1178 typename Config>
1179void CompilerA64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
1180 const AsmReg dst, const AssignmentPartRef ap) noexcept {
1181 auto frame_off = ap.variable_stack_off();
1182 assert(frame_off >= 0);
1183 if (!ASMIF(ADDxi, dst, DA_GP(29), frame_off)) {
1184 materialize_constant(frame_off, Config::GP_BANK, 4, dst);
1185 ASM(ADDx_uxtw, dst, DA_GP(29), dst, 0);
1186 }
1187}
1188
1189template <IRAdaptor Adaptor,
1190 typename Derived,
1191 template <typename, typename, typename> typename BaseTy,
1192 typename Config>
1193void CompilerA64<Adaptor, Derived, BaseTy, Config>::mov(
1194 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
1195 assert(dst.valid());
1196 assert(src.valid());
1197 if (dst.id() <= AsmReg::SP && src.id() <= AsmReg::SP) {
1198 assert(dst.id() != AsmReg::SP && src.id() != AsmReg::SP);
1199 if (size > 4) {
1200 ASM(MOVx, dst, src);
1201 } else {
1202 ASM(MOVw, dst, src);
1203 }
1204 } else if (dst.id() >= AsmReg::V0 && src.id() >= AsmReg::V0) {
1205 ASM(ORR16b, dst, src, src);
1206 } else if (dst.id() <= AsmReg::SP) {
1207 assert(dst.id() != AsmReg::SP);
1208 // gp<-vector
1209 assert(src.id() >= AsmReg::V0);
1210 assert(size <= 8);
1211 if (size <= 4) {
1212 ASM(FMOVws, dst, src);
1213 } else {
1214 ASM(FMOVxd, dst, src);
1215 }
1216 } else {
1217 // vector<-gp
1218 assert(src.id() <= AsmReg::R30);
1219 assert(dst.id() >= AsmReg::V0);
1220 assert(size <= 8);
1221 if (size <= 4) {
1222 ASM(FMOVsw, dst, src);
1223 } else {
1224 ASM(FMOVdx, dst, src);
1225 }
1226 }
1227}
1228
1229template <IRAdaptor Adaptor,
1230 typename Derived,
1231 template <typename, typename, typename> typename BaseTy,
1232 typename Config>
1233AsmReg CompilerA64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1234 GenericValuePart &gv) noexcept {
1235 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1236
1237 ScratchReg scratch{derived()};
1238 if (!expr.has_base() && !expr.has_index()) {
1239 AsmReg dst = scratch.alloc_gp();
1240 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, dst);
1241 expr.disp = 0;
1242 } else if (!expr.has_base() && expr.has_index()) {
1243 AsmReg index_reg = expr.index_reg();
1244 if (std::holds_alternative<ScratchReg>(expr.index)) {
1245 scratch = std::move(std::get<ScratchReg>(expr.index));
1246 } else {
1247 (void)scratch.alloc_gp();
1248 }
1249 AsmReg dst = scratch.cur_reg();
1250 if ((expr.scale & (expr.scale - 1)) == 0) {
1251 const auto shift = util::cnt_tz<u64>(expr.scale);
1252 ASM(LSLxi, dst, index_reg, shift);
1253 } else {
1254 AsmReg tmp2 = permanent_scratch_reg;
1255 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1256 ASM(MULx, dst, index_reg, tmp2);
1257 }
1258 } else if (expr.has_base() && expr.has_index()) {
1259 AsmReg base_reg = expr.base_reg();
1260 AsmReg index_reg = expr.index_reg();
1261 if (std::holds_alternative<ScratchReg>(expr.base)) {
1262 scratch = std::move(std::get<ScratchReg>(expr.base));
1263 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1264 scratch = std::move(std::get<ScratchReg>(expr.index));
1265 } else {
1266 (void)scratch.alloc_gp();
1267 }
1268 AsmReg dst = scratch.cur_reg();
1269 if ((expr.scale & (expr.scale - 1)) == 0) {
1270 const auto shift = util::cnt_tz<u64>(expr.scale);
1271 ASM(ADDx_lsl, dst, base_reg, index_reg, shift);
1272 } else {
1273 AsmReg tmp2 = permanent_scratch_reg;
1274 derived()->materialize_constant(expr.scale, Config::GP_BANK, 8, tmp2);
1275 ASM(MADDx, dst, index_reg, tmp2, base_reg);
1276 }
1277 } else if (expr.has_base() && !expr.has_index()) {
1278 AsmReg base_reg = expr.base_reg();
1279 if (std::holds_alternative<ScratchReg>(expr.base)) {
1280 scratch = std::move(std::get<ScratchReg>(expr.base));
1281 } else {
1282 (void)scratch.alloc_gp();
1283 }
1284 AsmReg dst = scratch.cur_reg();
1285 if (expr.disp != 0 && ASMIF(ADDxi, dst, base_reg, expr.disp)) {
1286 expr.disp = 0;
1287 } else if (dst != base_reg) {
1288 ASM(MOVx, dst, base_reg);
1289 }
1290 } else {
1291 TPDE_UNREACHABLE("inconsistent GenericValuePart::Expr");
1292 }
1293
1294 AsmReg dst = scratch.cur_reg();
1295 if (expr.disp != 0) {
1296 if (!ASMIF(ADDxi, dst, dst, expr.disp)) {
1297 AsmReg tmp2 = permanent_scratch_reg;
1298 derived()->materialize_constant(expr.disp, Config::GP_BANK, 8, tmp2);
1299 ASM(ADDx, dst, dst, tmp2);
1300 }
1301 }
1302
1303 gv.state = std::move(scratch);
1304 return dst;
1305}
1306
1307template <IRAdaptor Adaptor,
1308 typename Derived,
1309 template <typename, typename, typename> typename BaseTy,
1310 typename Config>
1311void CompilerA64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1312 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1313 const auto const_u64 = data[0];
1314 if (bank == Config::GP_BANK) {
1315 assert(size <= 8);
1316 if (const_u64 == 0) {
1317 ASM(MOVZw, dst, 0);
1318 return;
1319 }
1320
1321 this->text_writer.ensure_space(5 * 4);
1322 this->text_writer.cur_ptr() +=
1323 sizeof(u32) *
1324 de64_MOVconst(reinterpret_cast<u32 *>(this->text_writer.cur_ptr()),
1325 dst,
1326 const_u64);
1327 return;
1328 }
1329
1330 assert(bank == Config::FP_BANK);
1331 // Try instructions that take an immediate
1332 if (size == 4) {
1333 if (ASMIF(FMOVsi, dst, std::bit_cast<float>((u32)const_u64))) {
1334 return;
1335 } else if (ASMIF(MOVId, dst, static_cast<u32>(const_u64))) {
1336 return;
1337 }
1338 } else if (size == 8) {
1339 if (ASMIF(FMOVdi, dst, std::bit_cast<double>(const_u64))) {
1340 return;
1341 } else if (ASMIF(MOVId, dst, const_u64)) {
1342 return;
1343 }
1344 } else if (size == 16) {
1345 const auto high_u64 = data[1];
1346 if (const_u64 == high_u64 && ASMIF(MOVI2d, dst, const_u64)) {
1347 return;
1348 } else if (high_u64 == 0 && ASMIF(MOVId, dst, const_u64)) {
1349 return;
1350 }
1351 }
1352
1353 // We must either load through a GP register or from memory. Both cases need a
1354 // GP register in the common case. We reserve x16/x17 for cases like this.
1355 if (size <= 16) {
1356 this->register_file.mark_clobbered(permanent_scratch_reg);
1357 // Copy from a GP register
1358 // TODO: always load from memory?
1359 if (size <= 8) {
1360 materialize_constant(data, Config::GP_BANK, size, permanent_scratch_reg);
1361 if (size <= 4) {
1362 ASMNC(FMOVsw, dst, permanent_scratch_reg);
1363 } else {
1364 ASMNC(FMOVdx, dst, permanent_scratch_reg);
1365 }
1366 return;
1367 }
1368
1369 auto rodata = this->assembler.get_data_section(true, false);
1370 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), size};
1371 auto sym = this->assembler.sym_def_data(
1372 rodata, "", raw_data, 16, Assembler::SymBinding::LOCAL);
1373 this->text_writer.ensure_space(8); // ensure contiguous instructions
1374 this->reloc_text(
1375 sym, R_AARCH64_ADR_PREL_PG_HI21, this->text_writer.offset(), 0);
1376 ASMNC(ADRP, permanent_scratch_reg, 0, 0);
1377 this->reloc_text(
1378 sym, R_AARCH64_LDST128_ABS_LO12_NC, this->text_writer.offset(), 0);
1379 ASMNC(LDRqu, dst, permanent_scratch_reg, 0);
1380 return;
1381 }
1382
1383 TPDE_FATAL("unable to materialize constant");
1384}
1385
1386template <IRAdaptor Adaptor,
1387 typename Derived,
1388 template <typename, typename, typename> typename BaseTy,
1389 typename Config>
1390AsmReg
1391 CompilerA64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1392 const RegBank bank, IRValueRef) noexcept {
1393 // TODO(ts): why is this in here?
1394 assert(bank.id() < Config::NUM_BANKS);
1395 auto reg_mask = this->register_file.bank_regs(bank);
1396 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1397
1398 const auto find_possible_regs = [this,
1399 reg_mask](const u64 preferred_regs) -> u64 {
1400 // try to first get an unused reg, otherwise an unfixed reg
1401 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1402 u64 possible_regs = free_regs & preferred_regs & reg_mask;
1403 if (possible_regs == 0) {
1404 possible_regs = (this->register_file.used & ~this->register_file.fixed) &
1405 preferred_regs & reg_mask;
1406 }
1407 return possible_regs;
1408 };
1409
1410 u64 possible_regs;
1411 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1412 if (derived()->cur_func_may_emit_calls()) {
1413 // we can only allocate fixed assignments from the callee-saved regs
1414 possible_regs = find_possible_regs(csr);
1415 } else {
1416 // try allocating any non-callee-saved register first, except the result
1417 // registers
1418 possible_regs = find_possible_regs(~csr);
1419 if (possible_regs == 0) {
1420 // otherwise fallback to callee-saved regs
1421 possible_regs = find_possible_regs(csr);
1422 }
1423 }
1424
1425 if (possible_regs == 0) {
1426 return AsmReg::make_invalid();
1427 }
1428
1429 // try to first get an unused reg, otherwise an unfixed reg
1430 if ((possible_regs & ~this->register_file.used) != 0) {
1431 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1432 }
1433
1434 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1435 const auto reg = AsmReg{reg_id};
1436
1437 if (this->register_file.is_fixed(reg)) {
1438 continue;
1439 }
1440
1441 const auto local_idx = this->register_file.reg_local_idx(reg);
1442 const auto part = this->register_file.reg_part(reg);
1443
1444 if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
1445 continue;
1446 }
1447 auto *assignment = this->val_assignment(local_idx);
1448 auto ap = AssignmentPartRef{assignment, part};
1449 if (ap.modified()) {
1450 continue;
1451 }
1452
1453 return reg;
1454 }
1455
1456 return AsmReg::make_invalid();
1457}
1458
1459template <IRAdaptor Adaptor,
1460 typename Derived,
1461 template <typename, typename, typename> class BaseTy,
1462 typename Config>
1463typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1464 CompilerA64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1465 Jump jmp) noexcept {
1466 switch (jmp.kind) {
1467 case Jump::Jeq: return jmp.change_kind(Jump::Jne);
1468 case Jump::Jne: return jmp.change_kind(Jump::Jeq);
1469 case Jump::Jcs: return jmp.change_kind(Jump::Jcc);
1470 case Jump::Jcc: return jmp.change_kind(Jump::Jcs);
1471 case Jump::Jmi: return jmp.change_kind(Jump::Jpl);
1472 case Jump::Jpl: return jmp.change_kind(Jump::Jmi);
1473 case Jump::Jvs: return jmp.change_kind(Jump::Jvc);
1474 case Jump::Jvc: return jmp.change_kind(Jump::Jvs);
1475 case Jump::Jhi: return jmp.change_kind(Jump::Jls);
1476 case Jump::Jls: return jmp.change_kind(Jump::Jhi);
1477 case Jump::Jge: return jmp.change_kind(Jump::Jlt);
1478 case Jump::Jlt: return jmp.change_kind(Jump::Jge);
1479 case Jump::Jgt: return jmp.change_kind(Jump::Jle);
1480 case Jump::Jle: return jmp.change_kind(Jump::Jgt);
1481 case Jump::jmp: return jmp;
1482 case Jump::Cbz: return jmp.change_kind(Jump::Cbnz);
1483 case Jump::Cbnz: return jmp.change_kind(Jump::Cbz);
1484 case Jump::Tbz: return jmp.change_kind(Jump::Tbnz);
1485 case Jump::Tbnz: return jmp.change_kind(Jump::Tbz);
1486 default: TPDE_UNREACHABLE("invalid jump kind");
1487 }
1488}
1489
1490template <IRAdaptor Adaptor,
1491 typename Derived,
1492 template <typename, typename, typename> typename BaseTy,
1493 typename Config>
1494typename CompilerA64<Adaptor, Derived, BaseTy, Config>::Jump
1495 CompilerA64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1496 Jump jmp) noexcept {
1497 switch (jmp.kind) {
1498 case Jump::Jeq: return jmp.change_kind(Jump::Jeq);
1499 case Jump::Jne: return jmp.change_kind(Jump::Jne);
1500 case Jump::Jcc: return jmp.change_kind(Jump::Jhi);
1501 case Jump::Jcs: return jmp.change_kind(Jump::Jls);
1502 case Jump::Jhi: return jmp.change_kind(Jump::Jcc);
1503 case Jump::Jls: return jmp.change_kind(Jump::Jcs);
1504 case Jump::Jge: return jmp.change_kind(Jump::Jle);
1505 case Jump::Jlt: return jmp.change_kind(Jump::Jgt);
1506 case Jump::Jgt: return jmp.change_kind(Jump::Jlt);
1507 case Jump::Jle: return jmp.change_kind(Jump::Jge);
1508 case Jump::jmp: return jmp;
1509 case Jump::Jmi:
1510 case Jump::Jpl:
1511 case Jump::Jvs:
1512 case Jump::Jvc:
1513 case Jump::Cbz:
1514 case Jump::Cbnz:
1515 case Jump::Tbz:
1516 case Jump::Tbnz:
1517 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1518 }
1519}
1520
1521template <IRAdaptor Adaptor,
1522 typename Derived,
1523 template <typename, typename, typename> typename BaseTy,
1524 typename Config>
1525void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
1526 const Jump jmp,
1527 IRBlockRef target,
1528 const bool needs_split,
1529 const bool last_inst) noexcept {
1530 const auto target_idx = this->analyzer.block_idx(target);
1531 if (!needs_split || jmp.kind == Jump::jmp) {
1532 this->derived()->move_to_phi_nodes(target_idx);
1533
1534 if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
1535 generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
1536 }
1537 } else {
1538 auto tmp_label = this->assembler.label_create();
1539 generate_raw_jump(invert_jump(jmp), tmp_label);
1540
1541 this->derived()->move_to_phi_nodes(target_idx);
1542
1543 generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);
1544
1545 this->label_place(tmp_label);
1546 }
1547}
1548
1549template <IRAdaptor Adaptor,
1550 typename Derived,
1551 template <typename, typename, typename> typename BaseTy,
1552 typename Config>
1553void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1554 Jump jmp, Assembler::Label target_label) noexcept {
1555 const auto is_pending = this->assembler.label_is_pending(target_label);
1556 this->text_writer.ensure_space(4);
1557 if (jmp.kind == Jump::jmp) {
1558 if (is_pending) {
1559 ASMNC(B, 0);
1560 this->assembler.add_unresolved_entry(target_label,
1561 this->text_writer.get_sec_ref(),
1562 this->text_writer.offset() - 4,
1563 Assembler::UnresolvedEntryKind::BR);
1564 } else {
1565 const auto label_off = this->assembler.label_offset(target_label);
1566 const auto cur_off = this->text_writer.offset();
1567 assert(cur_off >= label_off);
1568 const auto diff = cur_off - label_off;
1569 assert((diff & 0b11) == 0);
1570 assert(diff < 128 * 1024 * 1024);
1571
1572 ASMNC(B, -static_cast<ptrdiff_t>(diff) / 4);
1573 }
1574 return;
1575 }
1576
1577 if (jmp.kind == Jump::Cbz || jmp.kind == Jump::Cbnz) {
1578 u32 off = 0;
1579 if (!is_pending) {
1580 const auto label_off = this->assembler.label_offset(target_label);
1581 const auto cur_off = this->text_writer.offset();
1582 assert(cur_off >= label_off);
1583 off = cur_off - label_off;
1584 assert((off & 0b11) == 0);
1585 assert(off < 128 * 1024 * 1024);
1586 }
1587
1588 if (off <= 1024 * 1024) {
1589 auto imm19 = -static_cast<ptrdiff_t>(off) / 4;
1590 if (jmp.kind == Jump::Cbz) {
1591 if (jmp.cmp_is_32) {
1592 ASMNC(CBZw, jmp.cmp_reg, imm19);
1593 } else {
1594 ASMNC(CBZx, jmp.cmp_reg, imm19);
1595 }
1596 } else {
1597 if (jmp.cmp_is_32) {
1598 ASMNC(CBNZw, jmp.cmp_reg, imm19);
1599 } else {
1600 ASMNC(CBNZx, jmp.cmp_reg, imm19);
1601 }
1602 }
1603
1604 if (is_pending) {
1605 this->assembler.add_unresolved_entry(
1606 target_label,
1607 this->text_writer.get_sec_ref(),
1608 this->text_writer.offset() - 4,
1609 Assembler::UnresolvedEntryKind::COND_BR);
1610 }
1611 } else {
1612 assert(!is_pending);
1613 this->text_writer.ensure_space(2 * 4);
1614
1615 if (jmp.kind == Jump::Cbz) {
1616 if (jmp.cmp_is_32) { // need to jump over 2 instructions
1617 ASMNC(CBNZw, jmp.cmp_reg, 2);
1618 } else {
1619 ASMNC(CBNZx, jmp.cmp_reg, 2);
1620 }
1621 } else {
1622 if (jmp.cmp_is_32) {
1623 ASMNC(CBZw, jmp.cmp_reg, 2);
1624 } else {
1625 ASMNC(CBZx, jmp.cmp_reg, 2);
1626 }
1627 }
1628 // + 4 since we already wrote the cb(n)z instruction
1629 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1630 }
1631 return;
1632 }
1633
1634 if (jmp.kind == Jump::Tbz || jmp.kind == Jump::Tbnz) {
1635 u32 off = 0;
1636 if (!is_pending) {
1637 const auto label_off = this->assembler.label_offset(target_label);
1638 const auto cur_off = this->text_writer.offset();
1639 assert(cur_off >= label_off);
1640 off = cur_off - label_off;
1641 assert((off & 0b11) == 0);
1642 assert(off < 128 * 1024 * 1024);
1643 }
1644
1645 if (off <= 32 * 1024) {
1646 auto imm14 = -static_cast<ptrdiff_t>(off) / 4;
1647 if (jmp.kind == Jump::Tbz) {
1648 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, imm14);
1649 } else {
1650 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, imm14);
1651 }
1652
1653 if (is_pending) {
1654 this->assembler.add_unresolved_entry(
1655 target_label,
1656 this->text_writer.get_sec_ref(),
1657 this->text_writer.offset() - 4,
1658 Assembler::UnresolvedEntryKind::TEST_BR);
1659 }
1660 } else {
1661 assert(!is_pending);
1662 this->text_writer.ensure_space(2 * 4);
1663
1664 if (jmp.kind == Jump::Tbz) {
1665 // need to jump over 2 instructions
1666 ASMNC(TBNZ, jmp.cmp_reg, jmp.test_bit, 2);
1667 } else {
1668 ASMNC(TBZ, jmp.cmp_reg, jmp.test_bit, 2);
1669 }
1670 // + 4 since we already wrote the tb(n)z instruction
1671 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1672 }
1673 return;
1674 }
1675
1676 Da64Cond cond, cond_compl;
1677 switch (jmp.kind) {
1678 case Jump::Jeq:
1679 cond = DA_EQ;
1680 cond_compl = DA_NE;
1681 break;
1682 case Jump::Jne:
1683 cond = DA_NE;
1684 cond_compl = DA_EQ;
1685 break;
1686 case Jump::Jcs:
1687 cond = DA_CS;
1688 cond_compl = DA_CC;
1689 break;
1690 case Jump::Jcc:
1691 cond = DA_CC;
1692 cond_compl = DA_CS;
1693 break;
1694 case Jump::Jmi:
1695 cond = DA_MI;
1696 cond_compl = DA_PL;
1697 break;
1698 case Jump::Jpl:
1699 cond = DA_PL;
1700 cond_compl = DA_MI;
1701 break;
1702 case Jump::Jvs:
1703 cond = DA_VS;
1704 cond_compl = DA_VC;
1705 break;
1706 case Jump::Jvc:
1707 cond = DA_VC;
1708 cond_compl = DA_VS;
1709 break;
1710 case Jump::Jhi:
1711 cond = DA_HI;
1712 cond_compl = DA_LS;
1713 break;
1714 case Jump::Jls:
1715 cond = DA_LS;
1716 cond_compl = DA_HI;
1717 break;
1718 case Jump::Jge:
1719 cond = DA_GE;
1720 cond_compl = DA_LT;
1721 break;
1722 case Jump::Jlt:
1723 cond = DA_LT;
1724 cond_compl = DA_GE;
1725 break;
1726 case Jump::Jgt:
1727 cond = DA_GT;
1728 cond_compl = DA_LE;
1729 break;
1730 case Jump::Jle:
1731 cond = DA_LE;
1732 cond_compl = DA_GT;
1733 break;
1734 default: TPDE_UNREACHABLE("invalid jump kind");
1735 }
1736
1737
1738 u32 off = 0;
1739 if (!is_pending) {
1740 const auto label_off = this->assembler.label_offset(target_label);
1741 const auto cur_off = this->text_writer.offset();
1742 assert(cur_off >= label_off);
1743 off = cur_off - label_off;
1744 assert((off & 0b11) == 0);
1745 assert(off < 128 * 1024 * 1024);
1746 }
1747
1748 if (off <= 1024 * 1024) {
1749 ASMNC(BCOND, cond, -static_cast<ptrdiff_t>(off) / 4);
1750
1751 if (is_pending) {
1752 this->assembler.add_unresolved_entry(
1753 target_label,
1754 this->text_writer.get_sec_ref(),
1755 this->text_writer.offset() - 4,
1756 Assembler::UnresolvedEntryKind::COND_BR);
1757 }
1758 } else {
1759 assert(!is_pending);
1760 this->text_writer.ensure_space(2 * 4);
1761
1762 // 2 to skip over the following branch
1763 ASMNC(BCOND, cond_compl, 2);
1764 // + 4 since we already wrote the branch instruction
1765 ASMNC(B, -static_cast<ptrdiff_t>(off + 4) / 4);
1766 }
1767}
1768
1769template <IRAdaptor Adaptor,
1770 typename Derived,
1771 template <typename, typename, typename> class BaseTy,
1772 typename Config>
1773void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1774 Jump jmp, AsmReg dst) noexcept {
1775 this->text_writer.ensure_space(4);
1776 switch (jmp.kind) {
1777 case Jump::Jeq: ASMNC(CSETw, dst, DA_EQ); break;
1778 case Jump::Jne: ASMNC(CSETw, dst, DA_NE); break;
1779 case Jump::Jcs: ASMNC(CSETw, dst, DA_CS); break;
1780 case Jump::Jcc: ASMNC(CSETw, dst, DA_CC); break;
1781 case Jump::Jmi: ASMNC(CSETw, dst, DA_MI); break;
1782 case Jump::Jpl: ASMNC(CSETw, dst, DA_PL); break;
1783 case Jump::Jvs: ASMNC(CSETw, dst, DA_VS); break;
1784 case Jump::Jvc: ASMNC(CSETw, dst, DA_VC); break;
1785 case Jump::Jhi: ASMNC(CSETw, dst, DA_HI); break;
1786 case Jump::Jls: ASMNC(CSETw, dst, DA_LS); break;
1787 case Jump::Jge: ASMNC(CSETw, dst, DA_GE); break;
1788 case Jump::Jlt: ASMNC(CSETw, dst, DA_LT); break;
1789 case Jump::Jgt: ASMNC(CSETw, dst, DA_GT); break;
1790 case Jump::Jle: ASMNC(CSETw, dst, DA_LE); break;
1791 case Jump::jmp: ASMNC(CSETw, dst, DA_AL); break;
1792 default: TPDE_UNREACHABLE("invalid condition for set/mask");
1793 }
1794}
1795
1796template <IRAdaptor Adaptor,
1797 typename Derived,
1798 template <typename, typename, typename> class BaseTy,
1799 typename Config>
1800void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1801 Jump jmp, AsmReg dst) noexcept {
1802 this->text_writer.ensure_space(4);
1803 switch (jmp.kind) {
1804 case Jump::Jeq: ASMNC(CSETMx, dst, DA_EQ); break;
1805 case Jump::Jne: ASMNC(CSETMx, dst, DA_NE); break;
1806 case Jump::Jcs: ASMNC(CSETMx, dst, DA_CS); break;
1807 case Jump::Jcc: ASMNC(CSETMx, dst, DA_CC); break;
1808 case Jump::Jmi: ASMNC(CSETMx, dst, DA_MI); break;
1809 case Jump::Jpl: ASMNC(CSETMx, dst, DA_PL); break;
1810 case Jump::Jvs: ASMNC(CSETMx, dst, DA_VS); break;
1811 case Jump::Jvc: ASMNC(CSETMx, dst, DA_VC); break;
1812 case Jump::Jhi: ASMNC(CSETMx, dst, DA_HI); break;
1813 case Jump::Jls: ASMNC(CSETMx, dst, DA_LS); break;
1814 case Jump::Jge: ASMNC(CSETMx, dst, DA_GE); break;
1815 case Jump::Jlt: ASMNC(CSETMx, dst, DA_LT); break;
1816 case Jump::Jgt: ASMNC(CSETMx, dst, DA_GT); break;
1817 case Jump::Jle: ASMNC(CSETMx, dst, DA_LE); break;
1818 case Jump::jmp: ASMNC(CSETMx, dst, DA_AL); break;
1819 default: TPDE_UNREACHABLE("invalid condition for set/mask");
1820 }
1821}
1822
1823template <IRAdaptor Adaptor,
1824 typename Derived,
1825 template <typename, typename, typename> class BaseTy,
1826 typename Config>
1827void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1828 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1829 assert(from < to && to <= 64);
1830 (void)to;
1831 if (sign) {
1832 if (to <= 32) {
1833 ASM(SBFXw, dst, src, 0, from);
1834 } else {
1835 ASM(SBFXx, dst, src, 0, from);
1836 }
1837 } else {
1838 if (to <= 32) {
1839 ASM(UBFXw, dst, src, 0, from);
1840 } else {
1841 ASM(UBFXx, dst, src, 0, from);
1842 }
1843 }
1844}
1845
1846template <IRAdaptor Adaptor,
1847 typename Derived,
1848 template <typename, typename, typename> typename BaseTy,
1849 typename Config>
1850void CompilerA64<Adaptor, Derived, BaseTy, Config>::generate_call(
1851 std::variant<Assembler::SymRef, ValuePart> &&target,
1852 std::span<CallArg> arguments,
1853 typename Base::ValueRef *result,
1854 bool) {
1855 CCAssignerAAPCS assigner;
1856 CallBuilder cb{*derived(), assigner};
1857 for (auto &arg : arguments) {
1858 cb.add_arg(std::move(arg));
1859 }
1860 cb.call(std::move(target));
1861 if (result) {
1862 cb.add_ret(*result);
1863 }
1864}
1865
1866template <IRAdaptor Adaptor,
1867 typename Derived,
1868 template <typename, typename, typename> typename BaseTy,
1869 typename Config>
1870 typename CompilerA64<Adaptor, Derived, BaseTy, Config>::ScratchReg
1871 CompilerA64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
1872 Assembler::SymRef sym, TLSModel model) noexcept {
1873 switch (model) {
1874 default: // TODO: implement optimized access for non-gd-model
1875 case TLSModel::GlobalDynamic: {
1876 ScratchReg r0_scratch{this};
1877 AsmReg r0 = r0_scratch.alloc_specific(AsmReg::R0);
1878 ScratchReg r1_scratch{this};
1879 AsmReg r1 = r1_scratch.alloc_specific(AsmReg::R1);
1880 // The call only clobbers flags, x0, x1, and lr. x0 and x1 are already fixed
1881 // in the scratch registers, so only make sure that lr isn't used otherwise.
1882 if (this->register_file.is_used(Reg{AsmReg::LR})) {
1883 this->evict_reg(Reg{AsmReg::LR});
1884 }
1885
1886 this->text_writer.ensure_space(0x18);
1887 this->reloc_text(
1888 sym, R_AARCH64_TLSDESC_ADR_PAGE21, this->text_writer.offset(), 0);
1889 ASMNC(ADRP, r0, 0, 0);
1890 this->reloc_text(
1891 sym, R_AARCH64_TLSDESC_LD64_LO12, this->text_writer.offset(), 0);
1892 ASMNC(LDRxu, r1, r0, 0);
1893 this->reloc_text(
1894 sym, R_AARCH64_TLSDESC_ADD_LO12, this->text_writer.offset(), 0);
1895 ASMNC(ADDxi, r0, r0, 0);
1896 this->reloc_text(
1897 sym, R_AARCH64_TLSDESC_CALL, this->text_writer.offset(), 0);
1898 ASMNC(BLR, r1);
1899 ASMNC(MRS, r1, 0xde82); // TPIDR_EL0
1900 // TODO: maybe return expr x0+x1.
1901 ASMNC(ADDx, r0, r1, r0);
1902 return r0_scratch;
1903 }
1904 }
1905}
1906
1907} // namespace tpde::a64