#include "AssemblerElfX64.hpp"
#include "tpde/CompilerBase.hpp"
#include "tpde/ValLocalIdx.hpp"
#include "tpde/ValueAssignment.hpp"
#include "tpde/base.hpp"
#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
  #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
#endif
#define ASM_FULL(compiler, reserve, op, ...)                                   \
  ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))

#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMC(compiler, op, ...)                                                \
  ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMF(op, flag, ...)                                                    \
  ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNCF(op, flag, ...)                                                  \
  ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
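
// Shorthand wrappers around asm_helper(fe64_<op>).encode(...). Naming scheme
// (inferred from the definitions above): plain ASM uses `this` and reserves
// 16 bytes of buffer space before encoding; the ...C variants take an
// explicit compiler; the ...F variants pass an encoder flag (e.g. FE_JMPL);
// the ...NC variants reserve no space, so the caller must have called
// text_writer.ensure_space() itself.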

namespace tpde::x64 {

struct AsmReg : Reg {
  // Register ids follow the hardware encoding: GP registers occupy ids 0-15,
  // XMM registers start at id 32 (matching the 0xFFFF'0000'FFFF bank masks
  // used below).
  enum REG : u8 {
    AX = 0,
    CX,
    DX,
    BX,
    SP,
    BP,
    SI,
    DI,
    R8,
    R9,
    R10,
    R11,
    R12,
    R13,
    R14,
    R15,

    XMM0 = 32,
    XMM1,
    XMM2,
    XMM3,
    XMM4,
    XMM5,
    XMM6,
    XMM7,
    XMM8,
    XMM9,
    XMM10,
    XMM11,
    XMM12,
    XMM13,
    XMM14,
    XMM15,
  };

  constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}

  constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}

  constexpr AsmReg(const Reg base) noexcept : Reg(base) {}

  constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr operator FeRegGP() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  operator FeRegGPLH() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  constexpr operator FeRegXMM() const noexcept {
    assert(reg_id >= XMM0 && reg_id <= XMM15);
    return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
  }
};
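
// Helpers to build register bitmasks with one bit set per register id. For
// example, create_bitmask({AsmReg::AX, AsmReg::DX}) yields 0b101, since AX
// and DX have ids 0 and 2 in the encoding above.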
constexpr static u64
    create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg;
  }
  return set;
}

template <size_t N>
constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg.id();
  }
  return set;
}
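
// Calling-convention assigner for the x86-64 System V ABI: up to six integer
// arguments in DI, SI, DX, CX, R8, R9, up to eight floating-point arguments
// in XMM0-XMM7, everything else on the stack; results in AX/DX and
// XMM0/XMM1.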
class CCAssignerSysV : public CCAssigner {
public:
  // BP and SP are reserved as frame and stack pointer and therefore never
  // allocatable.
  static constexpr CCInfo Info{
      .allocatable_regs =
          0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
      .callee_saved_regs = create_bitmask({
          AsmReg::BX,
          AsmReg::R12,
          AsmReg::R13,
          AsmReg::R14,
          AsmReg::R15,
      }),
      .arg_regs = create_bitmask({
          AsmReg::DI,
          AsmReg::SI,
          AsmReg::DX,
          AsmReg::CX,
          AsmReg::R8,
          AsmReg::R9,
          AsmReg::XMM0,
          AsmReg::XMM1,
          AsmReg::XMM2,
          AsmReg::XMM3,
          AsmReg::XMM4,
          AsmReg::XMM5,
          AsmReg::XMM6,
          AsmReg::XMM7,
      }),
  };

  u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
  // Number of further argument parts that must go to the stack because an
  // earlier part of the same consecutive group was assigned there.
  unsigned must_assign_stack = 0;
  bool vararg;
  u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;

  CCAssignerSysV(bool vararg = false) noexcept
      : CCAssigner(Info), vararg(vararg) {}

  void reset() noexcept override {
    gp_cnt = xmm_cnt = stack = 0;
    must_assign_stack = 0;
    ret_gp_cnt = ret_xmm_cnt = 0;
  }

  void assign_arg(CCAssignment &arg) noexcept override {
    if (arg.byval) {
      stack = util::align_up(stack, arg.byval_align < 8 ? 8 : arg.byval_align);
      arg.stack_off = stack;
      stack += arg.byval_size;
      return;
    }

    if (arg.bank == RegBank{0}) {
      static constexpr std::array<AsmReg, 6> gp_arg_regs{
          AsmReg::DI,
          AsmReg::SI,
          AsmReg::DX,
          AsmReg::CX,
          AsmReg::R8,
          AsmReg::R9,
      };
      if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
        arg.reg = gp_arg_regs[gp_cnt];
        gp_cnt += 1;
      } else {
        // A consecutive group is passed as a whole: once one part goes to
        // the stack, the remaining parts must follow.
        must_assign_stack = arg.consecutive + 1;
        stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
        arg.stack_off = stack;
        stack += 8;
      }
    } else {
      if (!must_assign_stack && xmm_cnt < 8) {
        arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
        xmm_cnt += 1;
      } else {
        must_assign_stack = arg.consecutive + 1;
        u32 size = util::align_up(arg.size, 8);
        stack = util::align_up(stack, size);
        arg.stack_off = stack;
        stack += size;
      }
    }

    if (must_assign_stack > 0) {
      must_assign_stack -= 1;
    }
  }

  u32 get_stack_size() noexcept override { return stack; }

  bool is_vararg() const noexcept override { return vararg; }

  void assign_ret(CCAssignment &arg) noexcept override {
    assert(!arg.byval && !arg.sret);
    if (arg.bank == RegBank{0}) {
      if (ret_gp_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
        ret_gp_cnt += 1;
      }
    } else {
      if (ret_xmm_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
        ret_xmm_cnt += 1;
      }
    }
  }
};

struct PlatformConfig : CompilerConfigDefault {
  using Assembler = AssemblerElfX64;
  using AsmReg = tpde::x64::AsmReg;
  using DefaultCCAssigner = CCAssignerSysV;

  static constexpr RegBank GP_BANK{0};
  static constexpr RegBank FP_BANK{1};
  static constexpr bool FRAME_INDEXING_NEGATIVE = true;
  static constexpr u32 PLATFORM_POINTER_SIZE = 8;
  static constexpr u32 NUM_BANKS = 2;
};

namespace concepts {
template <typename T, typename Config>
concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
  {
    a.arg_is_int128(std::declval<typename T::IRValueRef>())
  } -> std::convertible_to<bool>;

  {
    a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
  } -> std::convertible_to<bool>;
};
} // namespace concepts
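
// Usage sketch (names are illustrative, not part of this header): an
// IR-specific backend derives from CompilerX64 via CRTP and implements the
// hooks checked by the concept above, e.g.
//
//   struct MyCompiler : tpde::x64::CompilerX64<MyIRAdaptor, MyCompiler> {
//     bool arg_is_int128(IRValueRef) noexcept { return false; }
//     bool arg_allow_split_reg_stack_passing(IRValueRef) noexcept {
//       return false;
//     }
//     // ...plus the hooks required by tpde::Compiler itself.
//   };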

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy =
              CompilerBase,
          typename Config = PlatformConfig>
struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
  using Base = BaseTy<Adaptor, Derived, Config>;

  using IRValueRef = typename Base::IRValueRef;
  using IRBlockRef = typename Base::IRBlockRef;
  using IRFuncRef = typename Base::IRFuncRef;

  using ScratchReg = typename Base::ScratchReg;
  using ValuePartRef = typename Base::ValuePartRef;
  using ValuePart = typename Base::ValuePart;
  using GenericValuePart = typename Base::GenericValuePart;

  using Assembler = typename PlatformConfig::Assembler;
  using RegisterFile = typename Base::RegisterFile;

  using CallArg = typename Base::CallArg;

  Derived *derived() noexcept { return static_cast<Derived *>(this); }

  // Number of registers per bank that may be permanently bound to values.
  static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
                                                                           5};

  enum CPU_FEATURES : u32 {
    CPU_BASELINE = 0, // x86-64-v1
    CPU_CMPXCHG16B = (1 << 0),
    CPU_POPCNT = (1 << 1),
    CPU_SSE3 = (1 << 2),
    CPU_SSSE3 = (1 << 3),
    CPU_SSE4_1 = (1 << 4),
    CPU_SSE4_2 = (1 << 5),
    CPU_AVX = (1 << 6),
    CPU_AVX2 = (1 << 7),
    CPU_BMI1 = (1 << 8),
    CPU_BMI2 = (1 << 9),
    CPU_F16C = (1 << 10),
    CPU_FMA = (1 << 11),
    CPU_LZCNT = (1 << 12),
    CPU_MOVBE = (1 << 13),
    CPU_AVX512F = (1 << 14),
    CPU_AVX512BW = (1 << 15),
    CPU_AVX512CD = (1 << 16),
    CPU_AVX512DQ = (1 << 17),
    CPU_AVX512VL = (1 << 18),

    // Feature sets of the x86-64 microarchitecture levels.
    CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
             CPU_SSE4_1 | CPU_SSE4_2,
    CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
             CPU_FMA | CPU_LZCNT | CPU_MOVBE,
    CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
             CPU_AVX512VL,
  };

  CPU_FEATURES cpu_feats = CPU_BASELINE;

  // AX, DX, and CX are excluded from fixed assignments, presumably because
  // they are frequently needed as implicit operands (divisions, shifts).
  u64 fixed_assignment_nonallocatable_mask =
      create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});

  u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
      func_reg_restore_alloc = 0u;
  u32 frame_size_setup_offset = 0u;
  u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
  u32 reg_save_frame_off = 0;
  u32 var_arg_stack_off = 0;
  util::SmallVector<u32, 8> func_ret_offs = {};

  Assembler::SymRef sym_tls_get_addr;

  class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
    u32 stack_adjust_off = 0;

    void set_stack_used() noexcept;

  public:
    CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
        : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}

    void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
    void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
    void call_impl(
        std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept;
    void reset_stack() noexcept;
  };

  explicit CompilerX64(Adaptor *adaptor,
                       const CPU_FEATURES cpu_features = CPU_BASELINE)
      : Base{adaptor}, cpu_feats(cpu_features) {
    static_assert(std::is_base_of_v<CompilerX64, Derived>);
    static_assert(concepts::Compiler<Derived, PlatformConfig>);
  }

  template <typename... Args>
  auto asm_helper(unsigned (*fn)(u8 *, int, Args...)) {
    struct Helper {
      CompilerX64 *compiler;
      unsigned (*fn)(u8 *, int, Args...);

      void encode(unsigned reserve, int flags, Args... args) {
        if (reserve) {
          compiler->text_writer.ensure_space(reserve);
        }
        unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
        assert(n != 0);
        compiler->text_writer.cur_ptr() += n;
      }
    };
    return Helper{this, fn};
  }
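
  // Example: ASM(MOV64rr, FE_BP, FE_SP) expands to
  // asm_helper(fe64_MOV64rr).encode(16, 0, FE_BP, FE_SP): it reserves 16
  // bytes, encodes `mov rbp, rsp` at the current write position, and bumps
  // the write pointer by the instruction length.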

  void start_func(u32 func_idx) noexcept;

  void gen_func_prolog_and_args(CCAssigner *cc_assigner) noexcept;

  void finish_func(u32 func_idx) noexcept;

  void reset() noexcept;

  void gen_func_epilog() noexcept;

  void spill_reg(const AsmReg reg,
                 const i32 frame_off,
                 const u32 size) noexcept;

  void load_from_stack(AsmReg dst,
                       i32 frame_off,
                       u32 size,
                       bool sign_extend = false) noexcept;

  void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;

  void mov(AsmReg dst, AsmReg src, u32 size) noexcept;

  GenericValuePart val_spill_slot(ValuePart &val_ref) noexcept {
    const auto ap = val_ref.assignment();
    assert(ap.stack_valid() && !ap.variable_ref());
    return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
  }

  AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;

  void materialize_constant(const u64 *data,
                            RegBank bank,
                            u32 size,
                            AsmReg dst) noexcept;

  AsmReg select_fixed_assignment_reg(RegBank bank, IRValueRef) noexcept;

  // Branch conditions used by the compiler (subset of the x86 condition
  // codes).
  enum class Jump {
    ja,
    jae,
    jb,
    jbe,
    je,
    jg,
    jge,
    jl,
    jle,
    jmp,
    jne,
    jno,
    jo,
    js,
    jns,
    jp,
    jnp,
  };

  Jump invert_jump(Jump jmp) noexcept;
  Jump swap_jump(Jump jmp) noexcept;

  void generate_branch_to_block(Jump jmp,
                                IRBlockRef target,
                                bool needs_split,
                                bool last_inst) noexcept;

  void generate_raw_jump(Jump jmp, Assembler::Label target) noexcept;

  void generate_raw_set(Jump jmp, AsmReg dst) noexcept;
  void generate_raw_mask(Jump jmp, AsmReg dst) noexcept;

  void generate_raw_intext(
      AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;

  void generate_call(std::variant<Assembler::SymRef, ValuePart> &&target,
                     std::span<CallArg> arguments,
                     typename Base::ValueRef *result,
                     bool variable_args = false);

  ScratchReg tls_get_addr(Assembler::SymRef sym, TLSModel model) noexcept;

  bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
    return ((cpu_feats & feats) == feats);
  }
};

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
    const u32 /*func_idx*/) noexcept {
  this->text_writer.align(16);
  this->assembler.except_begin_func();
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
    CCAssigner *cc_assigner) noexcept {
  // Prologue layout:
  //   push rbp
  //   mov rbp, rsp
  //   <reserved space for callee-saved register pushes>
  //   sub rsp, <frame_size>  (patched in finish_func)
  func_ret_offs.clear();
  func_start_off = this->text_writer.offset();
  scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;

  const CCInfo &cc_info = cc_assigner->get_ccinfo();

  ASM(PUSHr, FE_BP);
  ASM(MOV64rr, FE_BP, FE_SP);

  func_reg_save_off = this->text_writer.offset();

  auto csr = cc_info.callee_saved_regs;
  assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
         "non-gp callee-saved registers not implemented");
  // Which callee-saved registers actually need saving is only known after
  // the body is compiled, so just reserve space: a push of AX-DI encodes in
  // one byte, a push of R8-R15 in two.
  u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
  u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
  u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
  this->stack.frame_size = 8 * (csr_logp + csr_higp);
  this->text_writer.ensure_space(reg_save_size);
  this->text_writer.cur_ptr() += reg_save_size;
  func_reg_save_alloc = reg_save_size;
  func_reg_restore_alloc = reg_save_size;

  // Placeholder for `sub rsp, <frame_size>`; the maximal immediate forces
  // the 7-byte imm32 encoding.
  frame_size_setup_offset = this->text_writer.offset();
  ASM(SUB64ri, FE_SP, 0x7FFF'FFFF);
  assert((this->text_writer.offset() - frame_size_setup_offset) == 7);
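
  // The SUB immediate starts at byte 3 of the 7-byte encoding
  // (REX.W + 0x81 /5 + imm32); finish_func patches the final frame size at
  // frame_size_setup_offset + 3 and asserts that the instruction still
  // decodes as `sub rsp, imm`.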

  if (this->adaptor->cur_is_vararg()) {
    this->stack.frame_size += 6 * 8 + 8 * 16;
    reg_save_frame_off = this->stack.frame_size;
    auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
    ASM(MOV64mr, mem, FE_DI);
    mem.off += 8;
    ASM(MOV64mr, mem, FE_SI);
    mem.off += 8;
    ASM(MOV64mr, mem, FE_DX);
    mem.off += 8;
    ASM(MOV64mr, mem, FE_CX);
    mem.off += 8;
    ASM(MOV64mr, mem, FE_R8);
    mem.off += 8;
    ASM(MOV64mr, mem, FE_R9);
    auto skip_fp = this->assembler.label_create();
    ASM(TEST8rr, FE_AX, FE_AX);
    generate_raw_jump(Jump::je, skip_fp);
    mem.off += 8;
    ASM(SSE_MOVDQUmr, mem, FE_XMM0);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM1);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM2);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM3);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM4);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM5);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM6);
    mem.off += 16;
    ASM(SSE_MOVDQUmr, mem, FE_XMM7);
    this->label_place(skip_fp);
  }
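
  // The block above spills the SysV register save area used by va_start:
  // six GP argument registers (48 bytes) followed by eight XMM argument
  // registers (128 bytes). At entry to a vararg function, AL holds the
  // number of XMM registers used for arguments, so the XMM stores are
  // skipped when it is zero.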

  assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
         "argument registers must also be allocatable");
  // Block argument registers from being allocated while the arguments are
  // moved into place; each one is re-enabled as soon as its value is bound.
  this->register_file.allocatable &= ~cc_info.arg_regs;

  u32 arg_idx = 0;
  for (const IRValueRef arg : this->adaptor->cur_args()) {
    derived()->handle_func_arg(
        arg_idx, arg, [&](ValuePart &&vp, CCAssignment cca) {
          cca.bank = vp.bank();
          cca.size = vp.part_size();

          cc_assigner->assign_arg(cca);

          if (cca.reg.valid()) [[likely]] {
            vp.set_value_reg(this, cca.reg);
            // The register now holds a tracked value and may be allocated
            // again.
            this->register_file.allocatable |= u64{1} << cca.reg.id();
          } else if (cca.byval) {
            // byval arguments already live in the incoming argument area;
            // turn the value into a stack variable referencing that slot
            // (0x10 skips the saved RBP and the return address).
            ValLocalIdx local_idx = this->val_idx(arg);
            this->assignments.value_ptrs[u32(local_idx)] = nullptr;
            this->init_variable_ref(local_idx, 0);
            ValueAssignment *assignment = this->val_assignment(local_idx);
            assignment->stack_variable = true;
            assignment->frame_off = 0x10 + cca.stack_off;
          } else if (!vp.is_owned()) {
            // Regular stack argument: load it into a register.
            AsmReg dst = vp.alloc_reg(this);
            this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);
          }
        });
    arg_idx += 1;
  }

  if (this->adaptor->cur_is_vararg()) [[unlikely]] {
    // va_start needs to know how many argument registers hold named
    // arguments; count the ones handed out above.
    auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
    u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
    u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
    this->scalar_arg_count = std::popcount(gp_regs);
    this->vec_arg_count = std::popcount(xmm_regs);
    this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
  }

  this->register_file.allocatable |= cc_info.arg_regs;
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
    u32 func_idx) noexcept {
  auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
  // push rbp
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
  this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
  this->assembler.eh_write_inst(
      dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
  // mov rbp, rsp
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
  this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
                                dwarf::x64::DW_reg_rbp);

  // Placeholder advance over the register-save area; patched below once the
  // actual prologue size is known.
  auto fde_prologue_adv_off = this->assembler.eh_writer.size();
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);

  auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  u64 saved_regs = this->register_file.clobbered & csr;
  u32 num_saved_regs = 0u;
  for (auto reg : util::BitSetIterator{saved_regs}) {
    assert(reg <= AsmReg::R15);
    write_ptr +=
        fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});

    static const u8 gpreg_to_dwarf[] = {
        dwarf::x64::DW_reg_rax,
        dwarf::x64::DW_reg_rcx,
        dwarf::x64::DW_reg_rdx,
        dwarf::x64::DW_reg_rbx,
        dwarf::x64::DW_reg_rsp,
        dwarf::x64::DW_reg_rbp,
        dwarf::x64::DW_reg_rsi,
        dwarf::x64::DW_reg_rdi,
        dwarf::x64::DW_reg_r8,
        dwarf::x64::DW_reg_r9,
        dwarf::x64::DW_reg_r10,
        dwarf::x64::DW_reg_r11,
        dwarf::x64::DW_reg_r12,
        dwarf::x64::DW_reg_r13,
        dwarf::x64::DW_reg_r14,
        dwarf::x64::DW_reg_r15,
    };
    u8 dwarf_reg = gpreg_to_dwarf[reg];
    auto cfa_off = num_saved_regs + 2;
    this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
    num_saved_regs += 1;
  }

  const u32 prologue_size =
      write_ptr - (this->text_writer.begin_ptr() + func_start_off);
  assert(prologue_size < 0x44);
  this->assembler.eh_writer.data()[fde_prologue_adv_off] =
      dwarf::DW_CFA_advance_loc | (prologue_size - 4);
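
  // DW_CFA_advance_loc encodes its delta in the low six bits of the opcode
  // byte, so the patched value must stay below 0x40; together with the four
  // bytes already advanced, this is what the `prologue_size < 0x44` assert
  // above checks.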

  // Patch the real frame size into the `sub rsp, imm32` placeholder.
  const auto final_frame_size =
      util::align_up(this->stack.frame_size, 16) - num_saved_regs * 8;
  *reinterpret_cast<u32 *>(this->text_writer.begin_ptr() +
                           frame_size_setup_offset + 3) = final_frame_size;

#ifndef NDEBUG
  // Sanity-check that the patched bytes still decode as `sub rsp, imm`.
  FdInstr instr;
  assert(fd_decode(this->text_writer.begin_ptr() + frame_size_setup_offset,
                   7,
                   64,
                   0,
                   &instr) == 7);
  assert(FD_TYPE(&instr) == FDI_SUB);
  assert(FD_OP_TYPE(&instr, 0) == FD_OT_REG);
  assert(FD_OP_TYPE(&instr, 1) == FD_OT_IMM);
  assert(FD_OP_SIZE(&instr, 0) == 8);
  assert(FD_OP_SIZE(&instr, 1) == 8);
  assert(FD_OP_IMM(&instr, 1) == final_frame_size);
#endif

  // Pad the unused register-save space with a NOP.
  const auto reg_save_end =
      this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
  assert(reg_save_end >= write_ptr);
  const u32 nop_len = reg_save_end - write_ptr;
  if (nop_len) {
    fe64_NOP(write_ptr, nop_len);
  }

  auto func_sym = this->func_syms[func_idx];
  auto func_sec = this->text_writer.get_sec_ref();
  if (func_ret_offs.empty()) {
    auto func_size = this->text_writer.offset() - func_start_off;
    this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
    this->assembler.eh_end_fde(fde_off, func_sym);
    this->assembler.except_encode_func(func_sym);
    return;
  }

  auto *text_data = this->text_writer.begin_ptr();
  u32 first_ret_off = func_ret_offs[0];
  // 7 bytes for add rsp/lea rsp, one byte pop rbp, one byte ret, plus the
  // reserved space for the callee-saved pops.
  u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc;
  u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;

  write_ptr = text_data + first_ret_off;
  const auto ret_start = write_ptr;
  if (this->adaptor->cur_has_dynamic_alloca()) {
    // RSP may have moved; restore it from the frame pointer.
    if (num_saved_regs == 0) {
      write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
    } else {
      write_ptr += fe64_LEA64rm(
          write_ptr,
          0,
          FE_SP,
          FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
    }
  } else {
    write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, final_frame_size);
  }
  for (auto reg : util::BitSetIterator<true>{saved_regs}) {
    assert(reg <= AsmReg::R15);
    write_ptr +=
        fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
  }
  write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
  write_ptr += fe64_RET(write_ptr, 0);
  const u32 ret_size = write_ptr - ret_start;
  assert(ret_size <= epilogue_size && "function epilogue too long");

  if (epilogue_size > ret_size) {
    fe64_NOP(write_ptr, epilogue_size - ret_size);
    if (first_ret_off == func_end_ret_off) {
      // Trim the padding when this return is the function's last instruction.
      this->text_writer.cur_ptr() -= epilogue_size - ret_size;
    }
  }

  for (u32 i = 1; i < func_ret_offs.size(); ++i) {
    memcpy(
        text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
    if (func_ret_offs[i] == func_end_ret_off) {
      this->text_writer.cur_ptr() -= epilogue_size - ret_size;
    }
  }

  auto func_size = this->text_writer.offset() - func_start_off;
  this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
  this->assembler.eh_end_fde(fde_off, func_sym);
  this->assembler.except_encode_func(func_sym);
}
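
// Return sites are emitted as fixed-size placeholders (see gen_func_epilog).
// finish_func writes the real epilogue once at the first return site and
// memcpy's the encoded bytes to all other return sites; return sites that end
// the function get their trailing NOP padding trimmed instead.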

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
  func_ret_offs.clear();
  sym_tls_get_addr = {};
  Base::reset();
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
  // The epilogue is patched in later (see finish_func); only record its
  // offset and reserve space for the largest possible encoding:
  //   add rsp, imm32 / lea rsp, [rbp - X]  (7 bytes reserved)
  //   pop <callee-saved regs>              (func_reg_restore_alloc bytes)
  //   pop rbp                              (1 byte)
  //   ret                                  (1 byte)
  func_ret_offs.push_back(this->text_writer.offset());

  const u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc;

  this->text_writer.ensure_space(epilogue_size);
  this->text_writer.cur_ptr() += epilogue_size;
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
    const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
  this->text_writer.ensure_space(16);
  assert(frame_off < 0);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
  if (reg.id() <= AsmReg::R15) {
    switch (size) {
    case 1: ASMNC(MOV8mr, mem, reg); break;
    case 2: ASMNC(MOV16mr, mem, reg); break;
    case 4: ASMNC(MOV32mr, mem, reg); break;
    case 8: ASMNC(MOV64mr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  } else {
    switch (size) {
    case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
    case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
    case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
    const AsmReg dst,
    const i32 frame_off,
    const u32 size,
    const bool sign_extend) noexcept {
  this->text_writer.ensure_space(16);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);

  if (dst.id() <= AsmReg::R15) {
    if (!sign_extend) {
      switch (size) {
      case 1: ASMNC(MOVZXr32m8, dst, mem); break;
      case 2: ASMNC(MOVZXr32m16, dst, mem); break;
      case 4: ASMNC(MOV32rm, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid load size");
      }
    } else {
      switch (size) {
      case 1: ASMNC(MOVSXr64m8, dst, mem); break;
      case 2: ASMNC(MOVSXr64m16, dst, mem); break;
      case 4: ASMNC(MOVSXr64m32, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid load size");
      }
    }
    return;
  }

  assert(!sign_extend);
  switch (size) {
  case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
  case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
  case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
  default: TPDE_UNREACHABLE("invalid load size");
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
    const AsmReg dst, const AssignmentPartRef ap) noexcept {
  ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
    const AsmReg dst, const AsmReg src, const u32 size) noexcept {
  if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
    if (size > 4) {
      ASM(MOV64rr, dst, src);
    } else {
      ASM(MOV32rr, dst, src);
    }
  } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
    if (size <= 16) {
      if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
        assert(has_cpu_feats(CPU_AVX512F));
        ASM(VMOVAPD128rr, dst, src);
      } else {
        ASM(SSE_MOVAPDrr, dst, src);
      }
    } else if (size <= 32) {
      assert(has_cpu_feats(CPU_AVX));
      assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
             has_cpu_feats(CPU_AVX512F));
      ASM(VMOVAPD256rr, dst, src);
    } else {
      assert(has_cpu_feats(CPU_AVX512F));
      ASM(VMOVAPD512rr, dst, src);
    }
  } else if (dst.id() <= AsmReg::R15) {
    // GP <- XMM
    assert(src.id() >= AsmReg::XMM0);
    assert(size <= 8);
    if (src.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASM(VMOVD_X2Grr, dst, src);
      } else {
        ASM(VMOVQ_X2Grr, dst, src);
      }
    } else if (size <= 4) {
      ASM(SSE_MOVD_X2Grr, dst, src);
    } else {
      ASM(SSE_MOVQ_X2Grr, dst, src);
    }
  } else {
    // XMM <- GP
    assert(src.id() <= AsmReg::R15);
    assert(dst.id() >= AsmReg::XMM0);
    assert(size <= 8);
    if (dst.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASM(VMOVD_G2Xrr, dst, src);
      } else {
        ASM(VMOVQ_G2Xrr, dst, src);
      }
    } else if (size <= 4) {
      ASM(SSE_MOVD_G2Xrr, dst, src);
    } else {
      ASM(SSE_MOVQ_G2Xrr, dst, src);
    }
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
    GenericValuePart &gv) noexcept {
  // Materialize base + index * scale + disp into a single register.
  auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);

  ScratchReg scratch{derived()};
  bool disp32 = i32(expr.disp) == expr.disp;
  AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
  AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
  // Reuse a scratch register already owned by the expression if possible.
  if (std::holds_alternative<ScratchReg>(expr.base)) {
    scratch = std::move(std::get<ScratchReg>(expr.base));
  } else if (std::holds_alternative<ScratchReg>(expr.index)) {
    scratch = std::move(std::get<ScratchReg>(expr.index));
  } else {
    (void)scratch.alloc_gp();
  }
  auto dst = scratch.cur_reg();

  if (idx.valid()) {
    if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
      // Scale 1/2/4/8: a single lea covers base + idx * scale (+ disp).
      const u8 sc = static_cast<u8>(expr.scale);
      if (base.valid() && disp32) {
        ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
        expr.disp = 0;
      } else if (base.valid()) {
        ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
      } else if (disp32) {
        ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
        expr.disp = 0;
      } else {
        ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
      }
      base = AsmReg::make_invalid();
    } else {
      u64 scale = expr.scale;
      // Compute idx * scale in a temporary that must not alias base.
      ScratchReg idx_scratch{derived()};
      AsmReg idx_tmp = dst;
      if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
        idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
      } else if (dst == base) {
        idx_tmp = idx_scratch.alloc_gp();
      }

      if ((scale & (scale - 1)) == 0) {
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        }
        ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
      } else if (i32(scale) == i64(scale)) {
        ASM(IMUL64rri, idx_tmp, idx, scale);
      } else {
        ScratchReg scratch2{derived()};
        auto tmp2 = scratch2.alloc_gp();
        ASM(MOV64ri, tmp2, scale);
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        }
        ASM(IMUL64rr, idx_tmp, tmp2);
      }

      if (base.valid()) {
        if (disp32 || (idx_tmp != dst && base != dst)) {
          ASM(LEA64rm,
              dst,
              FE_MEM(base, 1, idx_tmp, disp32 ? i32(expr.disp) : 0));
          if (disp32) {
            expr.disp = 0;
          }
        } else if (dst == base) {
          ASM(ADD64rr, dst, idx_tmp);
        } else {
          ASM(ADD64rr, dst, base);
        }
      } else if (idx_tmp != dst) {
        ASM(MOV64rr, dst, idx_tmp);
      }
    }
  } else if (base.valid()) {
    if (expr.disp && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
      expr.disp = 0;
    } else if (dst != base) {
      ASM(MOV64rr, dst, base);
    }
  }

  if (expr.disp) {
    // Displacement does not fit the addressing mode; add it manually.
    ScratchReg scratch2{derived()};
    auto tmp2 = scratch2.alloc_gp();
    ASM(MOV64ri, tmp2, expr.disp);
    ASM(ADD64rr, dst, tmp2);
  }

  gv.state = std::move(scratch);
  return dst;
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
    const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
  const auto const_u64 = data[0];
  if (bank == Config::GP_BANK) {
    assert(size <= 8);
    if (const_u64 == 0) {
      // Note: cannot use xor here, materializing a constant must not
      // clobber the status flags.
      ASM(MOV32ri, dst, 0);
    } else if ((const_u64 >> 32) == 0) {
      ASM(MOV32ri, dst, const_u64);
    } else {
      ASM(MOV64ri, dst, const_u64);
    }
    return;
  }

  assert(bank == Config::FP_BANK);
  const auto high_u64 = size <= 8 ? 0 : data[1];
  if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPXOR128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PXORrr, dst, dst);
    }
    return;
  }
  const u64 ones = -u64{1};
  if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPCMPEQB128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PCMPEQBrr, dst, dst);
    }
    return;
  }

  if (size <= 8) {
    // Try to materialize the constant in a free GP register and move it
    // over, avoiding a memory access.
    AsmReg tmp =
        this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
    if (tmp.valid()) {
      this->register_file.mark_clobbered(tmp);
      materialize_constant(data, Config::GP_BANK, size, tmp);
      if (size <= 4) {
        if (has_cpu_feats(CPU_AVX)) {
          ASM(VMOVD_G2Xrr, dst, tmp);
        } else {
          ASM(SSE_MOVD_G2Xrr, dst, tmp);
        }
      } else {
        if (has_cpu_feats(CPU_AVX)) {
          ASM(VMOVQ_G2Xrr, dst, tmp);
        } else {
          ASM(SSE_MOVQ_G2Xrr, dst, tmp);
        }
      }
      return;
    }
  }

  // Fall back to a load from a constant-pool entry in .rodata.
  auto alloc_size = util::align_up(size, 8);
  std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
  auto rodata = this->assembler.get_data_section(true, false);
  auto sym = this->assembler.sym_def_data(
      rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
  if (size <= 4) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 8) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 16) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else {
    TPDE_FATAL("unable to materialize constant");
  }

  this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
}
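
// Note on the constant-pool loads in materialize_constant: each mov is
// emitted with a RIP-relative placeholder displacement, and the
// R_X86_64_PC32 relocation at (text offset - 4) with addend -4 rewrites
// those four displacement bytes to point at the pool symbol.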

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
AsmReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
        const RegBank bank, IRValueRef) noexcept {
  assert(bank.id() <= Config::NUM_BANKS);
  auto reg_mask = this->register_file.bank_regs(bank);
  reg_mask &= ~fixed_assignment_nonallocatable_mask;

  const auto find_possible_regs = [this,
                                   reg_mask](const u64 preferred_regs) -> u64 {
    // Prefer completely free registers, then used-but-evictable ones.
    u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
    u64 possible_regs = free_regs & preferred_regs & reg_mask;
    if (possible_regs == 0) {
      possible_regs = (this->register_file.used & ~this->register_file.fixed) &
                      preferred_regs & reg_mask;
    }
    return possible_regs;
  };

  u64 possible_regs;
  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  if (derived()->cur_func_may_emit_calls()) {
    // Prefer callee-saved registers so the fixed value survives calls.
    possible_regs = find_possible_regs(csr);
  } else {
    // Otherwise prefer caller-saved registers.
    possible_regs = find_possible_regs(~csr);
    if (possible_regs == 0) {
      possible_regs = find_possible_regs(csr);
    }
  }

  if (possible_regs == 0) {
    return AsmReg::make_invalid();
  }

  // Prefer a register that is not currently in use.
  if ((possible_regs & ~this->register_file.used) != 0) {
    return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
  }

  // Otherwise pick a used register whose value does not need a write-back.
  for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
    const auto reg = AsmReg{reg_id};

    if (this->register_file.is_fixed(reg)) {
      continue;
    }

    const auto local_idx = this->register_file.reg_local_idx(reg);
    const auto part = this->register_file.reg_part(reg);

    if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
      continue;
    }
    auto *assignment = this->val_assignment(local_idx);
    auto ap = AssignmentPartRef{assignment, part};
    if (ap.modified()) {
      continue;
    }

    return reg;
  }

  return AsmReg::make_invalid();
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  case Jump::ja: return Jump::jbe;
  case Jump::jae: return Jump::jb;
  case Jump::jb: return Jump::jae;
  case Jump::jbe: return Jump::ja;
  case Jump::je: return Jump::jne;
  case Jump::jg: return Jump::jle;
  case Jump::jge: return Jump::jl;
  case Jump::jl: return Jump::jge;
  case Jump::jle: return Jump::jg;
  case Jump::jmp: return Jump::jmp;
  case Jump::jne: return Jump::je;
  case Jump::jno: return Jump::jo;
  case Jump::jo: return Jump::jno;
  case Jump::js: return Jump::jns;
  case Jump::jns: return Jump::js;
  case Jump::jp: return Jump::jnp;
  case Jump::jnp: return Jump::jp;
  default: TPDE_UNREACHABLE("invalid jump condition");
  }
}

// The condition to use when the operands of the comparison are swapped.
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  case Jump::ja: return Jump::jb;
  case Jump::jae: return Jump::jbe;
  case Jump::jb: return Jump::ja;
  case Jump::jbe: return Jump::jae;
  case Jump::je: return Jump::je;
  case Jump::jg: return Jump::jl;
  case Jump::jge: return Jump::jle;
  case Jump::jl: return Jump::jg;
  case Jump::jle: return Jump::jge;
  case Jump::jmp: return Jump::jmp;
  case Jump::jne: return Jump::jne;
  case Jump::jno: return Jump::jno;
  case Jump::jo: return Jump::jo;
  case Jump::js: return Jump::js;
  case Jump::jns: return Jump::jns;
  case Jump::jp: return Jump::jp;
  case Jump::jnp: return Jump::jnp;
  default: TPDE_UNREACHABLE("invalid jump condition");
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
    const Jump jmp,
    IRBlockRef target,
    const bool needs_split,
    const bool last_inst) noexcept {
  const auto target_idx = this->analyzer.block_idx(target);
  if (!needs_split || jmp == Jump::jmp) {
    this->derived()->move_to_phi_nodes(target_idx);

    if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
      generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
    }
  } else {
    // Phi moves must only happen on the taken edge; jump around them using
    // the inverted condition.
    auto tmp_label = this->assembler.label_create();
    generate_raw_jump(invert_jump(jmp), tmp_label);

    this->derived()->move_to_phi_nodes(target_idx);

    generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);

    this->label_place(tmp_label);
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
    Jump jmp, Assembler::Label target_label) noexcept {
  if (this->assembler.label_is_pending(target_label)) {
    // Forward jump: force a 32-bit displacement (FE_JMPL) and register an
    // unresolved entry so the displacement is patched when the label is
    // placed.
    this->text_writer.ensure_space(6);
    auto *target = this->text_writer.cur_ptr();
    switch (jmp) {
    case Jump::ja: ASMNCF(JA, FE_JMPL, target); break;
    case Jump::jae: ASMNCF(JNC, FE_JMPL, target); break;
    case Jump::jb: ASMNCF(JC, FE_JMPL, target); break;
    case Jump::jbe: ASMNCF(JBE, FE_JMPL, target); break;
    case Jump::je: ASMNCF(JZ, FE_JMPL, target); break;
    case Jump::jg: ASMNCF(JG, FE_JMPL, target); break;
    case Jump::jge: ASMNCF(JGE, FE_JMPL, target); break;
    case Jump::jl: ASMNCF(JL, FE_JMPL, target); break;
    case Jump::jle: ASMNCF(JLE, FE_JMPL, target); break;
    case Jump::jmp: ASMNCF(JMP, FE_JMPL, target); break;
    case Jump::jne: ASMNCF(JNZ, FE_JMPL, target); break;
    case Jump::jno: ASMNCF(JNO, FE_JMPL, target); break;
    case Jump::jo: ASMNCF(JO, FE_JMPL, target); break;
    case Jump::js: ASMNCF(JS, FE_JMPL, target); break;
    case Jump::jns: ASMNCF(JNS, FE_JMPL, target); break;
    case Jump::jp: ASMNCF(JP, FE_JMPL, target); break;
    case Jump::jnp: ASMNCF(JNP, FE_JMPL, target); break;
    }

    this->assembler.add_unresolved_entry(
        target_label,
        this->text_writer.get_sec_ref(),
        this->text_writer.offset() - 4,
        Assembler::UnresolvedEntryKind::JMP_OR_MEM_DISP);
  } else {
    // Backward jump: the target is known; the encoder picks the shortest
    // encoding.
    this->text_writer.ensure_space(6);
    auto *target = this->text_writer.begin_ptr() +
                   this->assembler.label_offset(target_label);
    switch (jmp) {
    case Jump::ja: ASMNC(JA, target); break;
    case Jump::jae: ASMNC(JNC, target); break;
    case Jump::jb: ASMNC(JC, target); break;
    case Jump::jbe: ASMNC(JBE, target); break;
    case Jump::je: ASMNC(JZ, target); break;
    case Jump::jg: ASMNC(JG, target); break;
    case Jump::jge: ASMNC(JGE, target); break;
    case Jump::jl: ASMNC(JL, target); break;
    case Jump::jle: ASMNC(JLE, target); break;
    case Jump::jmp: ASMNC(JMP, target); break;
    case Jump::jne: ASMNC(JNZ, target); break;
    case Jump::jno: ASMNC(JNO, target); break;
    case Jump::jo: ASMNC(JO, target); break;
    case Jump::js: ASMNC(JS, target); break;
    case Jump::jns: ASMNC(JNS, target); break;
    case Jump::jp: ASMNC(JP, target); break;
    case Jump::jnp: ASMNC(JNP, target); break;
    }
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
    Jump jmp, AsmReg dst) noexcept {
  // mov clears the full register without touching the flags the following
  // setcc consumes; setcc itself only writes the low byte.
  ASM(MOV32ri, dst, 0);
  switch (jmp) {
  case Jump::ja: ASM(SETA8r, dst); break;
  case Jump::jae: ASM(SETNC8r, dst); break;
  case Jump::jb: ASM(SETC8r, dst); break;
  case Jump::jbe: ASM(SETBE8r, dst); break;
  case Jump::je: ASM(SETZ8r, dst); break;
  case Jump::jg: ASM(SETG8r, dst); break;
  case Jump::jge: ASM(SETGE8r, dst); break;
  case Jump::jl: ASM(SETL8r, dst); break;
  case Jump::jle: ASM(SETLE8r, dst); break;
  case Jump::jmp: ASM(MOV32ri, dst, 1); break;
  case Jump::jne: ASM(SETNZ8r, dst); break;
  case Jump::jno: ASM(SETNO8r, dst); break;
  case Jump::jo: ASM(SETO8r, dst); break;
  case Jump::js: ASM(SETS8r, dst); break;
  case Jump::jns: ASM(SETNS8r, dst); break;
  case Jump::jp: ASM(SETP8r, dst); break;
  case Jump::jnp: ASM(SETNP8r, dst); break;
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
    Jump jmp, AsmReg dst) noexcept {
  // Produce 0/1 from the condition, then negate to get 0 or an all-ones
  // mask.
  generate_raw_set(jmp, dst);
  ASM(NEG64r, dst);
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
  assert(from < to && to <= 64);
  if (!sign) {
    switch (from) {
    case 8: ASM(MOVZXr32r8, dst, src); break;
    case 16: ASM(MOVZXr32r16, dst, src); break;
    case 32: ASM(MOV32rr, dst, src); break;
    default:
      if (from < 32) {
        if (dst != src) {
          ASM(MOV32rr, dst, src);
        }
        ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
      } else if (dst != src) {
        ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
        ASM(AND64rr, dst, src);
      } else {
        ScratchReg tmp{this};
        AsmReg tmp_reg = tmp.alloc_gp();
        ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
        ASM(AND64rr, dst, tmp_reg);
      }
      break;
    }
  } else if (to <= 32) {
    switch (from) {
    case 8: ASM(MOVSXr32r8, dst, src); break;
    case 16: ASM(MOVSXr32r16, dst, src); break;
    default:
      if (dst != src) {
        ASM(MOV32rr, dst, src);
      }
      ASM(SHL32ri, dst, 32 - from);
      ASM(SAR32ri, dst, 32 - from);
      break;
    }
  } else {
    switch (from) {
    case 8: ASM(MOVSXr64r8, dst, src); break;
    case 16: ASM(MOVSXr64r16, dst, src); break;
    case 32: ASM(MOVSXr64r32, dst, src); break;
    default:
      if (dst != src) {
        ASM(MOV64rr, dst, src);
      }
      ASM(SHL64ri, dst, 64 - from);
      ASM(SAR64ri, dst, 64 - from);
      break;
    }
  }
}
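
// Worked example: generate_raw_intext(dst, src, /*sign=*/true, /*from=*/13,
// /*to=*/32) has no movsx encoding for 13-bit sources, so it copies src and
// emits shl dst, 19 followed by sar dst, 19 (19 = 32 - 13), replicating the
// sign bit; with sign=false it would mask with (1 << 13) - 1 instead.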

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
    set_stack_used() noexcept {
  if (stack_adjust_off == 0) {
    // Placeholder `sub rsp, imm32`; patched in call_impl.
    stack_adjust_off = this->compiler.text_writer.offset();
    ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
    assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
  }
}
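
// 0x100 does not fit into an 8-bit immediate, so SUB64ri reliably encodes
// with a 32-bit immediate (7 bytes, checked by the assert above); call_impl
// later overwrites those four immediate bytes with the final,
// 16-byte-aligned outgoing-stack size.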

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
    ValuePart &vp, CCAssignment &cca) noexcept {
  AsmReg ptr = vp.load_to_reg(&this->compiler);
  ScratchReg scratch{&this->compiler};
  AsmReg tmp = scratch.alloc_gp();

  auto size = cca.byval_size;
  set_stack_used();

  // Copy the byval argument to its outgoing stack slot in the largest
  // possible chunks.
  i32 off = 0;
  while (size >= 8) {
    ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV64mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
    off += 8;
    size -= 8;
  }
  if (size >= 4) {
    ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV32mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
    off += 4;
    size -= 4;
  }
  if (size >= 2) {
    ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV16mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
    off += 2;
    size -= 2;
  }
  if (size >= 1) {
    ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV8mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
    ValuePart &vp, CCAssignment &cca) noexcept {
  set_stack_used();

  auto reg = vp.load_to_reg(&this->compiler);
  if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
    switch (cca.size) {
    case 1:
      ASMC(&this->compiler,
           MOV8mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    case 2:
      ASMC(&this->compiler,
           MOV16mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    case 4:
      ASMC(&this->compiler,
           MOV32mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    case 8:
      ASMC(&this->compiler,
           MOV64mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    default: TPDE_UNREACHABLE("invalid GP reg size");
    }
  } else {
    assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
    switch (cca.size) {
    case 4:
      ASMC(&this->compiler,
           SSE_MOVD_X2Gmr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    case 8:
      ASMC(&this->compiler,
           SSE_MOVQ_X2Gmr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    case 16:
      ASMC(&this->compiler,
           SSE_MOVAPDmr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
           reg);
      break;
    default: TPDE_UNREACHABLE("invalid FP reg size");
    }
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
    std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept {
  u32 sub = 0;
  if (this->assigner.is_vararg()) {
    // For vararg calls, the SysV ABI requires AL to hold the number of XMM
    // registers used for arguments.
    if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
      this->compiler.evict_reg(Reg{AsmReg::AX});
    }
    Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
        Config::FP_BANK, 0);
    unsigned xmm_cnt = 8;
    if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
      xmm_cnt = next_xmm.id() - AsmReg::XMM0;
    }
    ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
  }

  if (stack_adjust_off != 0) {
    // Patch the real, 16-byte-aligned stack adjustment into the SUB64ri
    // placeholder emitted by set_stack_used.
    auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
    sub = util::align_up(this->assigner.get_stack_size(), 0x10);
    memcpy(inst_ptr + 3, &sub, sizeof(u32));
  } else {
    assert(this->assigner.get_stack_size() == 0);
  }

  if (auto *sym = std::get_if<typename Assembler::SymRef>(&target)) {
    this->compiler.text_writer.ensure_space(16);
    ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
    this->compiler.reloc_text(
        *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
  } else {
    ValuePart &tvp = std::get<ValuePart>(target);
    if (AsmReg reg = tvp.cur_reg_unlocked(); reg.valid()) {
      ASMC(&this->compiler, CALLr, reg);
    } else if (tvp.has_assignment() && tvp.assignment().stack_valid()) {
      auto off = tvp.assignment().frame_off();
      ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
    } else {
      assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
      AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
      ASMC(&this->compiler, CALLr, reg);
    }
    tvp.reset(&this->compiler);
  }

  if (stack_adjust_off != 0) {
    // Restore the stack pointer after the call.
    ASMC(&this->compiler, ADD64ri, FE_SP, sub);
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
    std::variant<Assembler::SymRef, ValuePart> &&target,
    std::span<CallArg> arguments,
    typename Base::ValueRef *result,
    const bool variable_args) {
  CCAssignerSysV assigner{variable_args};
  CallBuilder cb{*derived(), assigner};
  for (auto &arg : arguments) {
    cb.add_arg(std::move(arg));
  }
  cb.call(std::move(target));
  if (result) {
    cb.add_ret(*result);
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
        Assembler::SymRef sym, TLSModel model) noexcept {
  switch (model) {
  default:
    // TODO: optimized sequences for the other TLS models; fall back to
    // global-dynamic for now.
  case TLSModel::GlobalDynamic: {
    // The call to __tls_get_addr clobbers all caller-saved registers.
    auto csr = CCAssignerSysV::Info.callee_saved_regs;
    for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
      this->evict_reg(Reg{reg});
    }
    ScratchReg arg{this};
    AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);

    // Emit the exact sequence the linker expects for the global-dynamic TLS
    // model:
    //   data16 lea rdi, [rip + sym@TLSGD]
    //   data16 data16 rex.W call __tls_get_addr@PLT
    // The 0x66 prefixes and the 0x48 byte are padding that keeps the
    // sequence relaxable to other TLS models.
    this->text_writer.ensure_space(0x10);
    *this->text_writer.cur_ptr()++ = 0x66;
    ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
    this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x48;
    ASMNC(CALL, this->text_writer.cur_ptr());
    if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
      this->sym_tls_get_addr = this->assembler.sym_add_undef(
          "__tls_get_addr", Assembler::SymBinding::GLOBAL);
    }
    this->reloc_text(this->sym_tls_get_addr,
                     R_X86_64_PLT32,
                     this->text_writer.offset() - 4,
                     -4);

    ScratchReg res{this};
    res.alloc_specific(AsmReg::AX);