6#include "AssemblerElfX64.hpp"
7#include "tpde/CompilerBase.hpp"
8#include "tpde/ValLocalIdx.hpp"
9#include "tpde/ValueAssignment.hpp"
10#include "tpde/base.hpp"
#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
  #error ASM macros are already defined elsewhere. Did you include compilers for multiple architectures?
#define ASM_FULL(compiler, reserve, op, ...) \
  ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))

#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMC(compiler, op, ...) \
  ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMF(op, flag, ...) \
  ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNCF(op, flag, ...) \
  ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
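
// Shorthand for emitting a single instruction through asm_helper() below:
// ASM/ASMC reserve 16 bytes of text space before encoding, while ASMNC/ASMNCF
// assume the caller has already reserved space; the *F variants forward an
// encoder flag (e.g. FE_JMPL) and ASMC takes an explicit compiler pointer.
// Typical use (as in the code below): ASM(MOV64rr, FE_BP, FE_SP);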
  constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}

  constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}

  constexpr AsmReg(const Reg base) noexcept : Reg(base) {}

  constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }
  constexpr operator FeRegGP() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  operator FeRegGPLH() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  constexpr operator FeRegXMM() const noexcept {
    assert(reg_id >= XMM0 && reg_id <= XMM15);
    return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
  }
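
  // Implicit conversions to the encoder's FeRegGP/FeRegXMM operand types; the
  // asserts check that the register id actually lies in the matching bank.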
create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
  for (const auto reg : regs) {

constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
  for (const auto reg : regs) {
    set |= 1ull << reg.id();
class CCAssignerSysV : public CCAssigner {
  static constexpr CCInfo Info{
      0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
      .callee_saved_regs = create_bitmask({
      .arg_regs = create_bitmask({
  u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
  unsigned must_assign_stack = 0;
  u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;

  CCAssignerSysV(bool vararg = false) noexcept
      : CCAssigner(Info), vararg(vararg) {}

  void reset() noexcept override {
    gp_cnt = xmm_cnt = stack = 0;
    must_assign_stack = 0;
    ret_gp_cnt = ret_xmm_cnt = 0;
  void assign_arg(CCAssignment &arg) noexcept override {
      stack = util::align_up(stack, arg.byval_align < 8 ? 8 : arg.byval_align);
      arg.stack_off = stack;
      stack += arg.byval_size;

    if (arg.bank == RegBank{0}) {
      static constexpr std::array<AsmReg, 6> gp_arg_regs{
      if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
        arg.reg = gp_arg_regs[gp_cnt];
        must_assign_stack = arg.consecutive + 1;
        stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
        arg.stack_off = stack;
      if (!must_assign_stack && xmm_cnt < 8) {
        arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
        must_assign_stack = arg.consecutive + 1;
        u32 size = util::align_up(arg.size, 8);
        stack = util::align_up(stack, size);
        arg.stack_off = stack;

    if (must_assign_stack > 0) {
      must_assign_stack -= 1;
  u32 get_stack_size() noexcept override { return stack; }

  bool is_vararg() const noexcept override { return vararg; }
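
  // Return values follow the SysV convention: up to two GP parts in RAX/RDX
  // and up to two FP parts in XMM0/XMM1.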
  void assign_ret(CCAssignment &arg) noexcept override {
    assert(!arg.byval && !arg.sret);
    if (arg.bank == RegBank{0}) {
      if (ret_gp_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
      if (ret_xmm_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
struct PlatformConfig : CompilerConfigDefault {
  using Assembler = AssemblerElfX64;
  using AsmReg = tpde::x64::AsmReg;
  using DefaultCCAssigner = CCAssignerSysV;

  static constexpr RegBank GP_BANK{0};
  static constexpr RegBank FP_BANK{1};
  static constexpr bool FRAME_INDEXING_NEGATIVE = true;
  static constexpr u32 PLATFORM_POINTER_SIZE = 8;
  static constexpr u32 NUM_BANKS = 2;
template <typename T, typename Config>
concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
  {
    a.arg_is_int128(std::declval<typename T::IRValueRef>())
  } -> std::convertible_to<bool>;

  {
    a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
  } -> std::convertible_to<bool>;
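
// A concrete compiler derives from CompilerX64 via CRTP and has to model this
// concept, roughly like the following (hypothetical sketch, names invented):
//
//   struct MyCompiler : CompilerX64<MyAdaptor, MyCompiler> {
//     bool arg_is_int128(IRValueRef) const { return false; }
//     bool arg_allow_split_reg_stack_passing(IRValueRef) const { return false; }
//   };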
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy =
          typename Config = PlatformConfig>
struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
  using Base = BaseTy<Adaptor, Derived, Config>;

  using IRValueRef = typename Base::IRValueRef;
  using IRBlockRef = typename Base::IRBlockRef;
  using IRFuncRef = typename Base::IRFuncRef;

  using ScratchReg = typename Base::ScratchReg;
  using ValuePartRef = typename Base::ValuePartRef;
  using ValuePart = typename Base::ValuePart;
  using GenericValuePart = typename Base::GenericValuePart;

  using Assembler = typename PlatformConfig::Assembler;
  using RegisterFile = typename Base::RegisterFile;

  using CallArg = typename Base::CallArg;
  static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,

  enum CPU_FEATURES : u32 {
    CPU_CMPXCHG16B = (1 << 0),
    CPU_POPCNT = (1 << 1),
    CPU_SSSE3 = (1 << 3),
    CPU_SSE4_1 = (1 << 4),
    CPU_SSE4_2 = (1 << 5),
    CPU_F16C = (1 << 10),
    CPU_LZCNT = (1 << 12),
    CPU_MOVBE = (1 << 13),
    CPU_AVX512F = (1 << 14),
    CPU_AVX512BW = (1 << 15),
    CPU_AVX512CD = (1 << 16),
    CPU_AVX512DQ = (1 << 17),
    CPU_AVX512VL = (1 << 18),

    CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
             CPU_SSE4_1 | CPU_SSE4_2,
    CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
             CPU_FMA | CPU_LZCNT | CPU_MOVBE,
    CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |

  CPU_FEATURES cpu_feats = CPU_BASELINE;
  u64 fixed_assignment_nonallocatable_mask =
      create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
  u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
      func_reg_restore_alloc = 0u;
  u32 frame_size_setup_offset = 0u;
  u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
  u32 reg_save_frame_off = 0;
  u32 var_arg_stack_off = 0;
  util::SmallVector<u32, 8> func_ret_offs = {};

  Assembler::SymRef sym_tls_get_addr;
  class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
    u32 stack_adjust_off = 0;

    void set_stack_used() noexcept;

    CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
        : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}

    void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
    void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
    void call_impl(
        std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept;
    void reset_stack() noexcept;
  explicit CompilerX64(Adaptor *adaptor,
                       const CPU_FEATURES cpu_features = CPU_BASELINE)
      : Base{adaptor}, cpu_feats(cpu_features) {
    static_assert(std::is_base_of_v<CompilerX64, Derived>);
    static_assert(concepts::Compiler<Derived, PlatformConfig>);

  template <typename... Args>
  auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
      CompilerX64 *compiler;

      void encode(unsigned reserve, int flags, Args... args) {
        compiler->text_writer.ensure_space(reserve);
        unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
        compiler->text_writer.cur_ptr() += n;

    return Helper{this, enc_fn};
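
  // ASM(op, ...) expands to asm_helper(fe64_op).encode(16, 0, ...): the helper
  // captures the encoder function pointer, reserves space in the text section,
  // encodes the instruction at the current write position, and advances the
  // write pointer by the number of bytes emitted.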
  void start_func(u32 func_idx) noexcept;

  void gen_func_prolog_and_args(CCAssigner *) noexcept;

  void finish_func(u32 func_idx) noexcept;

  void reset() noexcept;

  void gen_func_epilog() noexcept;

  void spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;

  void load_from_stack(AsmReg dst,
                       i32 frame_off,
                       u32 size,
                       bool sign_extend = false) noexcept;

  void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;

  void mov(AsmReg dst, AsmReg src, u32 size) noexcept;

  GenericValuePart val_spill_slot(ValuePart &val_ref) noexcept {
    const auto ap = val_ref.assignment();
    assert(ap.stack_valid() && !ap.variable_ref());
    return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
  }

  AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;

  void materialize_constant(const u64 *data,
                            const RegBank bank,
                            const u32 size,
                            AsmReg dst) noexcept;

  AsmReg select_fixed_assignment_reg(RegBank bank, IRValueRef) noexcept;
  Jump invert_jump(Jump jmp) noexcept;
  Jump swap_jump(Jump jmp) noexcept;

  void generate_branch_to_block(Jump jmp,
                                IRBlockRef target,
                                bool needs_split,
                                bool last_inst) noexcept;

  void generate_raw_jump(Jump jmp, Assembler::Label target) noexcept;

  void generate_raw_set(Jump cc, AsmReg dst) noexcept;

  void generate_raw_mask(Jump cc, AsmReg dst) noexcept;

  void generate_raw_cmov(Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept;

  void generate_raw_intext(
      AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;

  void generate_call(std::variant<Assembler::SymRef, ValuePart> &&target,
                     std::span<CallArg> arguments,
                     typename Base::ValueRef *result,
                     bool variable_args = false);

  ScratchReg tls_get_addr(Assembler::SymRef sym, TLSModel model) noexcept;
  bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
    return ((cpu_feats & feats) == feats);
  }
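
  // All requested feature bits must be present, e.g. has_cpu_feats(CPU_V3)
  // only returns true if every feature of that level was detected
  // (hypothetical usage, not part of this header).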
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
    const u32) noexcept {
  this->text_writer.align(16);
  this->assembler.except_begin_func();
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
    CCAssigner *cc_assigner) noexcept {
  func_ret_offs.clear();
  func_start_off = this->text_writer.offset();
  scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;

  const CCInfo &cc_info = cc_assigner->get_ccinfo();

  ASM(MOV64rr, FE_BP, FE_SP);

  func_reg_save_off = this->text_writer.offset();

  auto csr = cc_info.callee_saved_regs;
  assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
         "non-gp callee-saved registers not implemented");
  u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
  u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
  u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
  this->stack.frame_size = 8 * (csr_logp + csr_higp);
  this->text_writer.ensure_space(reg_save_size);
  this->text_writer.cur_ptr() += reg_save_size;
  func_reg_save_alloc = reg_save_size;
  func_reg_restore_alloc = reg_save_size;

  frame_size_setup_offset = this->text_writer.offset();
  ASM(SUB64ri, FE_SP, 0x7FFF'FFFF);
  assert((this->text_writer.offset() - frame_size_setup_offset) == 7);
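
  // The SUB above is encoded with a maximal 32-bit immediate so that it always
  // occupies 7 bytes; finish_func() patches the immediate with the final frame
  // size once it is known.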
  if (this->adaptor->cur_is_vararg()) {
    this->stack.frame_size += 6 * 8 + 8 * 16;
    reg_save_frame_off = this->stack.frame_size;
    auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
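    // va_start/va_arg register save area: the six GP argument registers are
    // always spilled; per the SysV ABI, AL carries an upper bound on the number
    // of XMM registers used, so the XMM spills below are skipped when it is 0.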
    ASM(MOV64mr, mem, FE_DI);
    ASM(MOV64mr, mem, FE_SI);
    ASM(MOV64mr, mem, FE_DX);
    ASM(MOV64mr, mem, FE_CX);
    ASM(MOV64mr, mem, FE_R8);
    ASM(MOV64mr, mem, FE_R9);
    auto skip_fp = this->assembler.label_create();
    ASM(TEST8rr, FE_AX, FE_AX);
    generate_raw_jump(Jump::je, skip_fp);
    ASM(SSE_MOVDQUmr, mem, FE_XMM0);
    ASM(SSE_MOVDQUmr, mem, FE_XMM1);
    ASM(SSE_MOVDQUmr, mem, FE_XMM2);
    ASM(SSE_MOVDQUmr, mem, FE_XMM3);
    ASM(SSE_MOVDQUmr, mem, FE_XMM4);
    ASM(SSE_MOVDQUmr, mem, FE_XMM5);
    ASM(SSE_MOVDQUmr, mem, FE_XMM6);
    ASM(SSE_MOVDQUmr, mem, FE_XMM7);
    this->label_place(skip_fp);
  assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
         "argument registers must also be allocatable");
  this->register_file.allocatable &= ~cc_info.arg_regs;

  for (const IRValueRef arg : this->adaptor->cur_args()) {
        [&](ValuePart &&vp, CCAssignment cca) -> std::optional<i32> {
          cca.bank = vp.bank();
          cca.size = vp.part_size();
          cc_assigner->assign_arg(cca);
          if (cca.reg.valid()) [[likely]] {
            vp.set_value_reg(this, cca.reg);
            this->register_file.allocatable |= u64{1} << cca.reg.id();
          return 0x10 + cca.stack_off;
          AsmReg dst = vp.alloc_reg(this);
          this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);

  if (this->adaptor->cur_is_vararg()) [[unlikely]] {
    auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
    u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
    u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
    this->scalar_arg_count = std::popcount(gp_regs);
    this->vec_arg_count = std::popcount(xmm_regs);
    this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();

  this->register_file.allocatable |= cc_info.arg_regs;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
    u32 func_idx) noexcept {
  auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
  this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
  this->assembler.eh_write_inst(
      dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
  this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
                                dwarf::x64::DW_reg_rbp);

  auto fde_prologue_adv_off = this->assembler.eh_writer.size();
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);

  auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  u64 saved_regs = this->register_file.clobbered & csr;
  u32 num_saved_regs = 0u;
  for (auto reg : util::BitSetIterator{saved_regs}) {
    assert(reg <= AsmReg::R15);
    write_ptr += fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});

    static const u8 gpreg_to_dwarf[] = {
        dwarf::x64::DW_reg_rax,
        dwarf::x64::DW_reg_rcx,
        dwarf::x64::DW_reg_rdx,
        dwarf::x64::DW_reg_rbx,
        dwarf::x64::DW_reg_rsp,
        dwarf::x64::DW_reg_rbp,
        dwarf::x64::DW_reg_rsi,
        dwarf::x64::DW_reg_rdi,
        dwarf::x64::DW_reg_r8,
        dwarf::x64::DW_reg_r9,
        dwarf::x64::DW_reg_r10,
        dwarf::x64::DW_reg_r11,
        dwarf::x64::DW_reg_r12,
        dwarf::x64::DW_reg_r13,
        dwarf::x64::DW_reg_r14,
        dwarf::x64::DW_reg_r15,
    };
    u8 dwarf_reg = gpreg_to_dwarf[reg];
    auto cfa_off = num_saved_regs + 2;
    this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);

      write_ptr - (this->text_writer.begin_ptr() + func_start_off);
  assert(prologue_size < 0x44);
  this->assembler.eh_writer.data()[fde_prologue_adv_off] =
      dwarf::DW_CFA_advance_loc | (prologue_size - 4);
  const auto final_frame_size =
      util::align_up(this->stack.frame_size, 16) - num_saved_regs * 8;
  *reinterpret_cast<u32 *>(this->text_writer.begin_ptr() +
                           frame_size_setup_offset + 3) = final_frame_size;

  assert(fd_decode(this->text_writer.begin_ptr() + frame_size_setup_offset,
  assert(FD_TYPE(&instr) == FDI_SUB);
  assert(FD_OP_TYPE(&instr, 0) == FD_OT_REG);
  assert(FD_OP_TYPE(&instr, 1) == FD_OT_IMM);
  assert(FD_OP_SIZE(&instr, 0) == 8);
  assert(FD_OP_SIZE(&instr, 1) == 8);
  assert(FD_OP_IMM(&instr, 1) == final_frame_size);

  const auto reg_save_end =
      this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
  assert(reg_save_end >= write_ptr);
  const u32 nop_len = reg_save_end - write_ptr;
  fe64_NOP(write_ptr, nop_len);
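  // Whatever part of the reserved register-save area was not needed for the
  // pushes is filled with NOP padding so the instruction stream stays valid.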
  auto func_sym = this->func_syms[func_idx];
  auto func_sec = this->text_writer.get_sec_ref();
  if (func_ret_offs.empty()) {
    auto func_size = this->text_writer.offset() - func_start_off;
    this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
    this->assembler.eh_end_fde(fde_off, func_sym);
    this->assembler.except_encode_func(func_sym);

  auto *text_data = this->text_writer.begin_ptr();
  u32 first_ret_off = func_ret_offs[0];
  u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc;
  u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;

  write_ptr = text_data + first_ret_off;
  const auto ret_start = write_ptr;
  if (this->adaptor->cur_has_dynamic_alloca()) {
    if (num_saved_regs == 0) {
      write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
          fe64_LEA64rm(write_ptr,
                       FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
    write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, final_frame_size);

  for (auto reg : util::BitSetIterator<true>{saved_regs}) {
    assert(reg <= AsmReg::R15);
    write_ptr += fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});

  write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
  write_ptr += fe64_RET(write_ptr, 0);
  ret_size = write_ptr - ret_start;
  assert(ret_size <= epilogue_size && "function epilogue too long");

  if (epilogue_size > ret_size) {
    fe64_NOP(write_ptr, epilogue_size - ret_size);
    if (first_ret_off == func_end_ret_off) {
      this->text_writer.cur_ptr() -= epilogue_size - ret_size;

  for (u32 i = 1; i < func_ret_offs.size(); ++i) {
        text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
    if (func_ret_offs[i] == func_end_ret_off) {
      this->text_writer.cur_ptr() -= epilogue_size - ret_size;

  auto func_size = this->text_writer.offset() - func_start_off;
  this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
  this->assembler.eh_end_fde(fde_off, func_sym);
  this->assembler.except_encode_func(func_sym);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
  func_ret_offs.clear();
  sym_tls_get_addr = {};
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
  func_ret_offs.push_back(this->text_writer.offset());

      func_reg_restore_alloc;
  this->text_writer.ensure_space(epilogue_size);
  this->text_writer.cur_ptr() += epilogue_size;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
    const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
  this->text_writer.ensure_space(16);
  assert(frame_off < 0);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
  if (reg.id() <= AsmReg::R15) {
    switch (size) {
    case 1: ASMNC(MOV8mr, mem, reg); break;
    case 2: ASMNC(MOV16mr, mem, reg); break;
    case 4: ASMNC(MOV32mr, mem, reg); break;
    case 8: ASMNC(MOV64mr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }

  switch (size) {
  case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
  case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
  case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
  default: TPDE_UNREACHABLE("invalid spill size");
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
    const AsmReg dst,
    const i32 frame_off,
    const u32 size,
    const bool sign_extend) noexcept {
  this->text_writer.ensure_space(16);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);

  if (dst.id() <= AsmReg::R15) {
    if (!sign_extend) {
      switch (size) {
      case 1: ASMNC(MOVZXr32m8, dst, mem); break;
      case 2: ASMNC(MOVZXr32m16, dst, mem); break;
      case 4: ASMNC(MOV32rm, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    } else {
      switch (size) {
      case 1: ASMNC(MOVSXr64m8, dst, mem); break;
      case 2: ASMNC(MOVSXr64m16, dst, mem); break;
      case 4: ASMNC(MOVSXr64m32, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    }

  assert(!sign_extend);
  switch (size) {
  case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
  case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
  case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
  default: TPDE_UNREACHABLE("invalid spill size");
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
    const AsmReg dst, const AssignmentPartRef ap) noexcept {
  ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
    const AsmReg dst, const AsmReg src, const u32 size) noexcept {
  if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
      ASM(MOV64rr, dst, src);
      ASM(MOV32rr, dst, src);
  } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
      if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
        assert(has_cpu_feats(CPU_AVX512F));
        ASM(VMOVAPD128rr, dst, src);
        ASM(SSE_MOVAPDrr, dst, src);
    } else if (size <= 32) {
      assert(has_cpu_feats(CPU_AVX));
      assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
             has_cpu_feats(CPU_AVX512F));
      ASM(VMOVAPD256rr, dst, src);
      assert(has_cpu_feats(CPU_AVX512F));
      ASM(VMOVAPD512rr, dst, src);
  } else if (dst.id() <= AsmReg::R15) {
    assert(src.id() >= AsmReg::XMM0);
    if (src.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      ASM(VMOVD_X2Grr, dst, src);
      ASM(VMOVQ_X2Grr, dst, src);
      ASM(SSE_MOVD_X2Grr, dst, src);
      ASM(SSE_MOVQ_X2Grr, dst, src);

    assert(src.id() <= AsmReg::R15);
    assert(dst.id() >= AsmReg::XMM0);
    if (dst.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      ASM(VMOVD_G2Xrr, dst, src);
      ASM(VMOVQ_G2Xrr, dst, src);
      ASM(SSE_MOVD_G2Xrr, dst, src);
      ASM(SSE_MOVQ_G2Xrr, dst, src);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
    GenericValuePart &gv) noexcept {
  auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);

  ScratchReg scratch{derived()};
  bool disp32 = i32(expr.disp) == expr.disp;
  AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
  AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
  if (std::holds_alternative<ScratchReg>(expr.base)) {
    scratch = std::move(std::get<ScratchReg>(expr.base));
  } else if (std::holds_alternative<ScratchReg>(expr.index)) {
    scratch = std::move(std::get<ScratchReg>(expr.index));
  (void)scratch.alloc_gp();
  auto dst = scratch.cur_reg();

  if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
    if (base.valid() && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
    } else if (base.valid()) {
      ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
    } else if (disp32) {
      ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
      ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));

    u64 scale = expr.scale;
    base = AsmReg::make_invalid();

    ScratchReg idx_scratch{derived()};
    AsmReg idx_tmp = dst;
    if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
      idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
    } else if (dst == base) {
      idx_tmp = idx_scratch.alloc_gp();

    if ((scale & (scale - 1)) == 0) {
      if (idx_tmp != idx) {
        ASM(MOV64rr, idx_tmp, idx);
      ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
      if (i32(scale) == i64(scale)) {
        ASM(IMUL64rri, idx_tmp, idx, scale);
        ScratchReg scratch2{derived()};
        auto tmp2 = scratch2.alloc_gp();
        ASM(MOV64ri, tmp2, scale);
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        ASM(IMUL64rr, idx_tmp, tmp2);

    if (disp32 || (idx_tmp != dst && base != dst)) {
      ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
    } else if (dst == base) {
      ASM(ADD64rr, dst, idx_tmp);
      ASM(ADD64rr, dst, base);

  } else if (base.valid()) {
    if (expr.disp && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
    } else if (dst != base) {
      ASM(MOV64rr, dst, base);

    ScratchReg scratch2{derived()};
    auto tmp2 = scratch2.alloc_gp();
    ASM(MOV64ri, tmp2, expr.disp);
    ASM(ADD64rr, dst, tmp2);

  gv.state = std::move(scratch);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
    const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
  const auto const_u64 = data[0];
  if (bank == Config::GP_BANK) {
    if (const_u64 == 0) {
      ASM(MOV32ri, dst, 0);
      ASM(MOV32ri, dst, const_u64);
      ASM(MOV64ri, dst, const_u64);

  assert(bank == Config::FP_BANK);
  const auto high_u64 = size <= 8 ? 0 : data[1];
  if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPXOR128rrr, dst, dst, dst);
      ASM(SSE_PXORrr, dst, dst);

  const u64 ones = -u64{1};
  if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPCMPEQB128rrr, dst, dst, dst);
      ASM(SSE_PCMPEQBrr, dst, dst);

      this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
    this->register_file.mark_clobbered(tmp);
    materialize_constant(data, Config::GP_BANK, size, tmp);
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVD_G2Xrr, dst, tmp);
      ASM(SSE_MOVD_G2Xrr, dst, tmp);
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVQ_G2Xrr, dst, tmp);
      ASM(SSE_MOVQ_G2Xrr, dst, tmp);

  auto alloc_size = util::align_up(size, 8);
  std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
  auto rodata = this->assembler.get_data_section(true, false);
  auto sym = this->assembler.sym_def_data(
      rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
      ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
  } else if (size <= 8) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
      ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
  } else if (size <= 16) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
      ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));

    TPDE_FATAL("unable to materialize constant");

  this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
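  // The FE_MEM(FE_IP, ..., -1) displacement used above is only a placeholder;
  // this relocation rewrites it to address the constant emitted into .rodata
  // (the -4 addend accounts for the distance from the displacement field to
  // the end of the instruction).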
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
        const RegBank bank, IRValueRef) noexcept {
  assert(bank.id() <= Config::NUM_BANKS);
  auto reg_mask = this->register_file.bank_regs(bank);
  reg_mask &= ~fixed_assignment_nonallocatable_mask;

  const auto find_possible_regs = [this,
                                   reg_mask](const u64 preferred_regs) -> u64 {
    u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
    u64 possible_regs = free_regs & preferred_regs & reg_mask;
    if (possible_regs == 0) {
      possible_regs = (this->register_file.used & ~this->register_file.fixed) &
                      preferred_regs & reg_mask;
    return possible_regs;

  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  if (derived()->cur_func_may_emit_calls()) {
    possible_regs = find_possible_regs(csr);
    possible_regs = find_possible_regs(~csr);
    if (possible_regs == 0) {
      possible_regs = find_possible_regs(csr);

  if (possible_regs == 0) {
    return AsmReg::make_invalid();

  if ((possible_regs & ~this->register_file.used) != 0) {
    return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};

  for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
    const auto reg = AsmReg{reg_id};

    if (this->register_file.is_fixed(reg)) {

    const auto local_idx = this->register_file.reg_local_idx(reg);
    const auto part = this->register_file.reg_part(reg);
    if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {

    auto *assignment = this->val_assignment(local_idx);
    auto ap = AssignmentPartRef{assignment, part};
    if (ap.modified()) {

  return AsmReg::make_invalid();
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  case Jump::ja: return Jump::jbe;
  case Jump::jae: return Jump::jb;
  case Jump::jb: return Jump::jae;
  case Jump::jbe: return Jump::ja;
  case Jump::je: return Jump::jne;
  case Jump::jg: return Jump::jle;
  case Jump::jge: return Jump::jl;
  case Jump::jl: return Jump::jge;
  case Jump::jle: return Jump::jg;
  case Jump::jmp: return Jump::jmp;
  case Jump::jne: return Jump::je;
  case Jump::jno: return Jump::jo;
  case Jump::jo: return Jump::jno;
  case Jump::js: return Jump::jns;
  case Jump::jns: return Jump::js;
  case Jump::jp: return Jump::jnp;
  case Jump::jnp: return Jump::jp;
  default: TPDE_UNREACHABLE("invalid jump condition");
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  case Jump::ja: return Jump::jb;
  case Jump::jae: return Jump::jbe;
  case Jump::jb: return Jump::ja;
  case Jump::jbe: return Jump::jae;
  case Jump::je: return Jump::je;
  case Jump::jg: return Jump::jl;
  case Jump::jge: return Jump::jle;
  case Jump::jl: return Jump::jg;
  case Jump::jle: return Jump::jge;
  case Jump::jmp: return Jump::jmp;
  case Jump::jne: return Jump::jne;
  case Jump::jno: return Jump::jno;
  case Jump::jo: return Jump::jo;
  case Jump::js: return Jump::js;
  case Jump::jns: return Jump::jns;
  case Jump::jp: return Jump::jp;
  case Jump::jnp: return Jump::jnp;
  default: TPDE_UNREACHABLE("invalid jump condition");
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
    Jump jmp,
    IRBlockRef target,
    const bool needs_split,
    const bool last_inst) noexcept {
  const auto target_idx = this->analyzer.block_idx(target);
  if (!needs_split || jmp == Jump::jmp) {
    this->derived()->move_to_phi_nodes(target_idx);

    if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
      generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);

    auto tmp_label = this->assembler.label_create();
    generate_raw_jump(invert_jump(jmp), tmp_label);

    this->derived()->move_to_phi_nodes(target_idx);

    generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);

    this->label_place(tmp_label);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
    Jump jmp, Assembler::Label target_label) noexcept {
  if (this->assembler.label_is_pending(target_label)) {
    this->text_writer.ensure_space(6);
    auto *target = this->text_writer.cur_ptr();
    switch (jmp) {
    case Jump::ja: ASMNCF(JA, FE_JMPL, target); break;
    case Jump::jae: ASMNCF(JNC, FE_JMPL, target); break;
    case Jump::jb: ASMNCF(JC, FE_JMPL, target); break;
    case Jump::jbe: ASMNCF(JBE, FE_JMPL, target); break;
    case Jump::je: ASMNCF(JZ, FE_JMPL, target); break;
    case Jump::jg: ASMNCF(JG, FE_JMPL, target); break;
    case Jump::jge: ASMNCF(JGE, FE_JMPL, target); break;
    case Jump::jl: ASMNCF(JL, FE_JMPL, target); break;
    case Jump::jle: ASMNCF(JLE, FE_JMPL, target); break;
    case Jump::jmp: ASMNCF(JMP, FE_JMPL, target); break;
    case Jump::jne: ASMNCF(JNZ, FE_JMPL, target); break;
    case Jump::jno: ASMNCF(JNO, FE_JMPL, target); break;
    case Jump::jo: ASMNCF(JO, FE_JMPL, target); break;
    case Jump::js: ASMNCF(JS, FE_JMPL, target); break;
    case Jump::jns: ASMNCF(JNS, FE_JMPL, target); break;
    case Jump::jp: ASMNCF(JP, FE_JMPL, target); break;
    case Jump::jnp: ASMNCF(JNP, FE_JMPL, target); break;
    }

    this->assembler.add_unresolved_entry(
        this->text_writer.get_sec_ref(),
        this->text_writer.offset() - 4,
        Assembler::UnresolvedEntryKind::JMP_OR_MEM_DISP);
  } else {
    this->text_writer.ensure_space(6);
    auto *target = this->text_writer.begin_ptr() +
                   this->assembler.label_offset(target_label);
    switch (jmp) {
    case Jump::ja: ASMNC(JA, target); break;
    case Jump::jae: ASMNC(JNC, target); break;
    case Jump::jb: ASMNC(JC, target); break;
    case Jump::jbe: ASMNC(JBE, target); break;
    case Jump::je: ASMNC(JZ, target); break;
    case Jump::jg: ASMNC(JG, target); break;
    case Jump::jge: ASMNC(JGE, target); break;
    case Jump::jl: ASMNC(JL, target); break;
    case Jump::jle: ASMNC(JLE, target); break;
    case Jump::jmp: ASMNC(JMP, target); break;
    case Jump::jne: ASMNC(JNZ, target); break;
    case Jump::jno: ASMNC(JNO, target); break;
    case Jump::jo: ASMNC(JO, target); break;
    case Jump::js: ASMNC(JS, target); break;
    case Jump::jns: ASMNC(JNS, target); break;
    case Jump::jp: ASMNC(JP, target); break;
    case Jump::jnp: ASMNC(JNP, target); break;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
    Jump cc, AsmReg dst) noexcept {
  ASM(MOV32ri, dst, 0);
  switch (cc) {
  case Jump::ja: ASM(SETA8r, dst); break;
  case Jump::jae: ASM(SETNC8r, dst); break;
  case Jump::jb: ASM(SETC8r, dst); break;
  case Jump::jbe: ASM(SETBE8r, dst); break;
  case Jump::je: ASM(SETZ8r, dst); break;
  case Jump::jg: ASM(SETG8r, dst); break;
  case Jump::jge: ASM(SETGE8r, dst); break;
  case Jump::jl: ASM(SETL8r, dst); break;
  case Jump::jle: ASM(SETLE8r, dst); break;
  case Jump::jmp: ASM(MOV32ri, dst, 1); break;
  case Jump::jne: ASM(SETNZ8r, dst); break;
  case Jump::jno: ASM(SETNO8r, dst); break;
  case Jump::jo: ASM(SETO8r, dst); break;
  case Jump::js: ASM(SETS8r, dst); break;
  case Jump::jns: ASM(SETNS8r, dst); break;
  case Jump::jp: ASM(SETP8r, dst); break;
  case Jump::jnp: ASM(SETNP8r, dst); break;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
    Jump cc, AsmReg dst) noexcept {
  generate_raw_set(cc, dst);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
    Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
  this->text_writer.ensure_space(16);
  if (is_64) {
    switch (cc) {
    case Jump::ja: ASMNC(CMOVA64rr, dst, src); break;
    case Jump::jae: ASMNC(CMOVNC64rr, dst, src); break;
    case Jump::jb: ASMNC(CMOVC64rr, dst, src); break;
    case Jump::jbe: ASMNC(CMOVBE64rr, dst, src); break;
    case Jump::je: ASMNC(CMOVZ64rr, dst, src); break;
    case Jump::jg: ASMNC(CMOVG64rr, dst, src); break;
    case Jump::jge: ASMNC(CMOVGE64rr, dst, src); break;
    case Jump::jl: ASMNC(CMOVL64rr, dst, src); break;
    case Jump::jle: ASMNC(CMOVLE64rr, dst, src); break;
    case Jump::jmp: ASMNC(MOV64rr, dst, src); break;
    case Jump::jne: ASMNC(CMOVNZ64rr, dst, src); break;
    case Jump::jno: ASMNC(CMOVNO64rr, dst, src); break;
    case Jump::jo: ASMNC(CMOVO64rr, dst, src); break;
    case Jump::js: ASMNC(CMOVS64rr, dst, src); break;
    case Jump::jns: ASMNC(CMOVNS64rr, dst, src); break;
    case Jump::jp: ASMNC(CMOVP64rr, dst, src); break;
    case Jump::jnp: ASMNC(CMOVNP64rr, dst, src); break;
    }
  } else {
    switch (cc) {
    case Jump::ja: ASMNC(CMOVA32rr, dst, src); break;
    case Jump::jae: ASMNC(CMOVNC32rr, dst, src); break;
    case Jump::jb: ASMNC(CMOVC32rr, dst, src); break;
    case Jump::jbe: ASMNC(CMOVBE32rr, dst, src); break;
    case Jump::je: ASMNC(CMOVZ32rr, dst, src); break;
    case Jump::jg: ASMNC(CMOVG32rr, dst, src); break;
    case Jump::jge: ASMNC(CMOVGE32rr, dst, src); break;
    case Jump::jl: ASMNC(CMOVL32rr, dst, src); break;
    case Jump::jle: ASMNC(CMOVLE32rr, dst, src); break;
    case Jump::jmp: ASMNC(MOV32rr, dst, src); break;
    case Jump::jne: ASMNC(CMOVNZ32rr, dst, src); break;
    case Jump::jno: ASMNC(CMOVNO32rr, dst, src); break;
    case Jump::jo: ASMNC(CMOVO32rr, dst, src); break;
    case Jump::js: ASMNC(CMOVS32rr, dst, src); break;
    case Jump::jns: ASMNC(CMOVNS32rr, dst, src); break;
    case Jump::jp: ASMNC(CMOVP32rr, dst, src); break;
    case Jump::jnp: ASMNC(CMOVNP32rr, dst, src); break;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
  assert(from < to && to <= 64);
    case 8: ASM(MOVZXr32r8, dst, src); break;
    case 16: ASM(MOVZXr32r16, dst, src); break;
    case 32: ASM(MOV32rr, dst, src); break;
      ASM(MOV32rr, dst, src);
      ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
    } else if (dst != src) {
      ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
      ASM(AND64rr, dst, src);
      ScratchReg tmp{this};
      AsmReg tmp_reg = tmp.alloc_gp();
      ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
      ASM(AND64rr, dst, tmp_reg);
  } else if (to <= 32) {
    case 8: ASM(MOVSXr32r8, dst, src); break;
    case 16: ASM(MOVSXr32r16, dst, src); break;
      ASM(MOV32rr, dst, src);
      ASM(SHL32ri, dst, 32 - from);
      ASM(SAR32ri, dst, 32 - from);
    case 8: ASM(MOVSXr64r8, dst, src); break;
    case 16: ASM(MOVSXr64r16, dst, src); break;
    case 32: ASM(MOVSXr64r32, dst, src); break;
      ASM(MOV64rr, dst, src);
      ASM(SHL64ri, dst, 64 - from);
      ASM(SAR64ri, dst, 64 - from);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
    set_stack_used() noexcept {
  if (stack_adjust_off == 0) {
    stack_adjust_off = this->compiler.text_writer.offset();
    ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
    assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
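    // The 0x100 immediate is a placeholder; call_impl() overwrites the 32-bit
    // immediate of this SUB with the final, 16-byte-aligned stack size.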
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
    ValuePart &vp, CCAssignment &cca) noexcept {
  AsmReg ptr = vp.load_to_reg(&this->compiler);
  ScratchReg scratch{&this->compiler};
  AsmReg tmp = scratch.alloc_gp();

  auto size = cca.byval_size;
      ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
      ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
      ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
      ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
    ValuePart &vp, CCAssignment &cca) noexcept {
  auto reg = vp.load_to_reg(&this->compiler);
  if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
    default: TPDE_UNREACHABLE("invalid GP reg size");
  } else {
    assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
      ASMC(&this->compiler,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
    default: TPDE_UNREACHABLE("invalid GP reg size");
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
    std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept {
  if (this->assigner.is_vararg()) {
    if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
      this->compiler.evict_reg(Reg{AsmReg::AX});

    Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
        Config::FP_BANK, 0);
    unsigned xmm_cnt = 8;
    if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
      xmm_cnt = next_xmm.id() - AsmReg::XMM0;

    ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
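    // For vararg calls the SysV ABI expects AL to hold an upper bound on the
    // number of vector registers used for arguments.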
  if (stack_adjust_off != 0) {
    auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
    sub = util::align_up(this->assigner.get_stack_size(), 0x10);
    memcpy(inst_ptr + 3, &sub, sizeof(u32));
  } else {
    assert(this->assigner.get_stack_size() == 0);
  }

  if (auto *sym = std::get_if<typename Assembler::SymRef>(&target)) {
    this->compiler.text_writer.ensure_space(16);
    ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
    this->compiler.reloc_text(
        *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
  } else {
    ValuePart &tvp = std::get<ValuePart>(target);
    if (AsmReg reg = tvp.cur_reg_unlocked(); reg.valid()) {
      ASMC(&this->compiler, CALLr, reg);
    } else if (tvp.has_assignment() && tvp.assignment().stack_valid()) {
      auto off = tvp.assignment().frame_off();
      ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
      assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
      AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
      ASMC(&this->compiler, CALLr, reg);
    tvp.reset(&this->compiler);

  if (stack_adjust_off != 0) {
    ASMC(&this->compiler, ADD64ri, FE_SP, sub);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
    std::variant<Assembler::SymRef, ValuePart> &&target,
    std::span<CallArg> arguments,
    typename Base::ValueRef *result,
    const bool variable_args) {
  CCAssignerSysV assigner{variable_args};
  CallBuilder cb{*derived(), assigner};
  for (auto &arg : arguments) {
    cb.add_arg(std::move(arg));
  }
  cb.call(std::move(target));
  if (result) {
    cb.add_ret(*result);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
        Assembler::SymRef sym, TLSModel model) noexcept {
  switch (model) {
  case TLSModel::GlobalDynamic: {
    auto csr = CCAssignerSysV::Info.callee_saved_regs;
    for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {

    ScratchReg arg{this};
    AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);

    this->text_writer.ensure_space(0x10);
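    // Emit the canonical TLS General Dynamic sequence: the 0x66 prefixes and
    // the 0x48 byte below are padding so the lea+call pair has the exact
    // layout linkers expect when relaxing TLS GD accesses.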
    *this->text_writer.cur_ptr()++ = 0x66;
    ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
    this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x48;
    ASMNC(CALL, this->text_writer.cur_ptr());
    if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
      this->sym_tls_get_addr = this->assembler.sym_add_undef(
          "__tls_get_addr", Assembler::SymBinding::GLOBAL);
    }
    this->reloc_text(this->sym_tls_get_addr,
                     this->text_writer.offset() - 4,

    ScratchReg res{this};
    res.alloc_specific(AsmReg::AX);

  void evict_reg(Reg reg) noexcept