#include "tpde/AssemblerElf.hpp"
#include "tpde/AssignmentPartRef.hpp"
#include "tpde/CompilerBase.hpp"
#include "tpde/base.hpp"
#include "tpde/x64/FunctionWriterX64.hpp"
#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
  #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
#endif
#define ASM_FULL(compiler, reserve, op, ...)                                   \
  ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))

#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMC(compiler, op, ...)                                                \
  ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMF(op, flag, ...)                                                    \
  ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNCF(op, flag, ...)                                                  \
  ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
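// Usage sketch (assuming the fe64_* encoders come from the bundled fadec
// library): ASM(ADD64rr, dst, src) reserves 16 bytes of text space and encodes
// the instruction at the current write pointer. The *NC variants skip the
// reservation (the caller already called ensure_space), the *F variants pass
// an explicit flag word (e.g. FE_JMPL for a 32-bit jump displacement), and
// ASMC names the compiler object explicitly instead of using `this`.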
  constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}

  constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}

  constexpr AsmReg(const Reg base) noexcept : Reg(base) {}

  constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr operator FeRegGP() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  operator FeRegGPLH() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  constexpr operator FeRegXMM() const noexcept {
    assert(reg_id >= XMM0 && reg_id <= XMM15);
    return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
  }
};
constexpr static u64
    create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg;
  }
  return set;
}

template <size_t N>
constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg.id();
  }
  return set;
}
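// x86-64 System V calling-convention assigner. Sketch of the rules visible in
// this excerpt: the first six integer arguments go in GP registers, the first
// eight floating-point arguments in XMM0-XMM7, everything else is placed on
// the stack in 8-byte-aligned slots; integer results are returned in AX/DX,
// floating-point results in XMM0/XMM1, and the standard 128-byte red zone
// below RSP is available.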
class CCAssignerSysV : public CCAssigner {
public:
  static constexpr CCInfo Info{
      .allocatable_regs =
          0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
      .callee_saved_regs = create_bitmask({
          // ... (callee-saved GP registers elided in this excerpt)
      }),
      .arg_regs = create_bitmask({
          // ... (argument registers elided in this excerpt)
      }),
      .red_zone_size = 128,
  };

  u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
  unsigned must_assign_stack = 0;
  u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;
  bool vararg;

  CCAssignerSysV(bool vararg = false) noexcept
      : CCAssigner(Info), vararg(vararg) {}
  void reset() noexcept override {
    gp_cnt = xmm_cnt = stack = 0;
    must_assign_stack = 0;
    ret_gp_cnt = ret_xmm_cnt = 0;
  }
  void assign_arg(CCAssignment &arg) noexcept override {
    // ... (byval arguments get a stack slot directly)
      stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
      arg.stack_off = stack;
      // ...

    if (arg.bank == RegBank{0}) {
      static constexpr std::array<AsmReg, 6> gp_arg_regs{
          AsmReg::DI, AsmReg::SI, AsmReg::DX, AsmReg::CX, AsmReg::R8, AsmReg::R9};
      if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
        arg.reg = gp_arg_regs[gp_cnt];
        // ...
      } else {
        must_assign_stack = arg.consecutive + 1;
        stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
        arg.stack_off = stack;
        // ...
      }
    } else if (arg.bank == RegBank{1}) {
      if (!must_assign_stack && xmm_cnt < 8) {
        arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
        // ...
      } else {
        must_assign_stack = arg.consecutive + 1;
        u32 size = util::align_up(arg.size, 8);
        stack = util::align_up(stack, size);
        arg.stack_off = stack;
        // ...
      }
    } else {
      stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
      arg.stack_off = stack;
      stack += util::align_up(arg.size, 8);
    }

    if (must_assign_stack > 0) {
      must_assign_stack -= 1;
    }
  }
  u32 get_stack_size() noexcept override { return stack; }

  bool is_vararg() const noexcept override { return vararg; }

  void assign_ret(CCAssignment &arg) noexcept override {
    assert(!arg.byval && !arg.sret);
    if (arg.bank == RegBank{0}) {
      if (ret_gp_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
        // ...
      } else {
        TPDE_UNREACHABLE("too many return values");
      }
    } else if (arg.bank == RegBank{1}) {
      if (ret_xmm_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
        // ...
      } else {
        TPDE_UNREACHABLE("too many return values");
      }
    } else {
      TPDE_UNREACHABLE("return value must have valid register bank");
    }
  }
};
struct PlatformConfig : CompilerConfigDefault {
  using Assembler = tpde::elf::AssemblerElfX64;
  using AsmReg = tpde::x64::AsmReg;
  // ...

  static constexpr RegBank GP_BANK{0};
  static constexpr RegBank FP_BANK{1};
  static constexpr bool FRAME_INDEXING_NEGATIVE = true;
  static constexpr u32 PLATFORM_POINTER_SIZE = 8;
  static constexpr u32 NUM_BANKS = 2;
};
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy =
              CompilerBase,
          typename Config = PlatformConfig>
struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
  using Base = BaseTy<Adaptor, Derived, Config>;

  using IRValueRef = typename Base::IRValueRef;
  using IRBlockRef = typename Base::IRBlockRef;
  using IRFuncRef = typename Base::IRFuncRef;

  using ScratchReg = typename Base::ScratchReg;
  using ValuePartRef = typename Base::ValuePartRef;
  using ValuePart = typename Base::ValuePart;
  using GenericValuePart = typename Base::GenericValuePart;

  using RegisterFile = typename Base::RegisterFile;

  using CallArg = typename Base::CallArg;
  static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
                                                                           /* ... */};

  enum CPU_FEATURES : u32 {
    CPU_BASELINE = 0,
    CPU_CMPXCHG16B = (1 << 0),
    CPU_POPCNT = (1 << 1),
    CPU_SSE3 = (1 << 2),
    CPU_SSSE3 = (1 << 3),
    CPU_SSE4_1 = (1 << 4),
    CPU_SSE4_2 = (1 << 5),
    // ... (CPU_AVX, CPU_AVX2, CPU_BMI1 and CPU_BMI2 are elided in this excerpt)
    CPU_F16C = (1 << 10),
    CPU_FMA = (1 << 11),
    CPU_LZCNT = (1 << 12),
    CPU_MOVBE = (1 << 13),
    CPU_AVX512F = (1 << 14),
    CPU_AVX512BW = (1 << 15),
    CPU_AVX512CD = (1 << 16),
    CPU_AVX512DQ = (1 << 17),
    CPU_AVX512VL = (1 << 18),

    CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
             CPU_SSE4_1 | CPU_SSE4_2,
    CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
             CPU_FMA | CPU_LZCNT | CPU_MOVBE,
    CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
             CPU_AVX512VL,
  };

  CPU_FEATURES cpu_feats = CPU_BASELINE;
  u64 fixed_assignment_nonallocatable_mask =
      create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
  u32 func_start_off = 0u, func_prologue_alloc = 0u;
  u32 reg_save_frame_off = 0;
  u32 var_arg_stack_off = 0;
  util::SmallVector<u32, 8> func_ret_offs = {};
  class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
    u32 stack_adjust_off = 0;

    void set_stack_used() noexcept;

  public:
    CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
        : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}

    void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
    void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
    void call_impl(std::variant<SymRef, ValuePart> &&target) noexcept;
    void reset_stack() noexcept;
  };
  explicit CompilerX64(Adaptor *adaptor,
                       const CPU_FEATURES cpu_features = CPU_BASELINE)
      : Base{adaptor}, cpu_feats(cpu_features) {
    static_assert(std::is_base_of_v<CompilerX64, Derived>);
  }

  template <typename... Args>
  auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
    struct Helper {
      CompilerX64 *compiler;
      unsigned (*fn)(u8 *, int, Args...);

      void encode(unsigned reserve, int flags, Args... args) {
        compiler->text_writer.ensure_space(reserve);
        unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
        compiler->text_writer.cur_ptr() += n;
      }
    };
    return Helper{this, enc_fn};
  }
  void start_func(u32 func_idx) noexcept;

  void prologue_begin(CCAssigner *cc_assigner) noexcept;
  void prologue_end(CCAssigner *cc_assigner) noexcept;
  std::optional<i32> prologue_assign_arg_part(ValuePart &&vp,
                                              CCAssignment cca) noexcept;
  // ...

  void finish_func(u32 func_idx) noexcept;

  void reset() noexcept;

  // ...
  void gen_func_epilog() noexcept;

  void set_preserve_flags(bool preserve) noexcept { preserve_flags = preserve; }

  void spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;

  void load_from_stack(AsmReg dst,
                       i32 frame_off,
                       u32 size,
                       bool sign_extend = false) noexcept;

  void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;

  void mov(AsmReg dst, AsmReg src, u32 size) noexcept;

  GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
    assert(ap.stack_valid() && !ap.variable_ref());
    return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
  }

  AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;

  void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;

  void alloca_dynamic(u64 elem_size,
                      ValuePart &&count,
                      u32 align,
                      ValuePart &res) noexcept;

  void materialize_constant(const u64 *data,
                            RegBank bank,
                            u32 size,
                            AsmReg dst) noexcept;

  AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;

  // ... (the Jump enum and a few condition helpers are elided in this excerpt)
  FeCond jump_to_cond(Jump jmp) noexcept;

  void generate_raw_intext(
      AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;

  void generate_call(std::variant<SymRef, ValuePart> &&target,
                     std::span<CallArg> arguments,
                     typename Base::ValueRef *result,
                     bool variable_args = false);

  void switch_emit_cmp(AsmReg cmp_reg,
                       AsmReg tmp_reg,
                       u64 case_value,
                       bool width_is_32) noexcept;

  void switch_emit_cmpeq(Label case_label,
                         AsmReg cmp_reg,
                         AsmReg tmp_reg,
                         u64 case_value,
                         bool width_is_32) noexcept;

  bool switch_emit_jump_table(Label default_label,
                              std::span<const Label> labels,
                              AsmReg cmp_reg,
                              AsmReg tmp_reg,
                              u64 low_bound,
                              u64 high_bound,
                              bool width_is_32) noexcept;

  void switch_emit_binary_step(Label case_label,
                               Label gt_label,
                               AsmReg cmp_reg,
                               AsmReg tmp_reg,
                               u64 case_value,
                               bool width_is_32) noexcept;

  ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;

  bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
    return ((cpu_feats & feats) == feats);
  }
};
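// Instantiation sketch (MyIRAdaptor/MyCompiler are hypothetical names): a
// backend derives from CompilerX64 via CRTP, e.g.
//   struct MyCompiler : tpde::x64::CompilerX64<MyIRAdaptor, MyCompiler> {...};
// The static_assert in the constructor enforces exactly this relationship.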
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
    const u32 /*func_idx*/) noexcept {
  this->preserve_flags = false;
  // ...
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_begin(
    CCAssigner *cc_assigner) noexcept {
  func_ret_offs.clear();
  func_start_off = this->text_writer.offset();

  const CCInfo &cc_info = cc_assigner->get_ccinfo();

  auto csr = cc_info.callee_saved_regs;
  assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
         "non-gp callee-saved registers not implemented");

  u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
  u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
  // A push of AX-DI is one byte, a push of R8-R15 needs a REX prefix.
  u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
  this->stack.frame_size = 8 * (csr_logp + csr_higp);

  // Reserve space for the prologue; it is written in finish_func once the
  // final frame size and the set of clobbered callee-saved registers is known.
  func_prologue_alloc = reg_save_size + 11;
  this->text_writer.ensure_space(func_prologue_alloc);
  this->text_writer.cur_ptr() += func_prologue_alloc;
  // ...

  if (this->adaptor->cur_is_vararg()) {
    this->stack.frame_used = true;
    this->stack.frame_size += 6 * 8 + 8 * 16;
    reg_save_frame_off = this->stack.frame_size;
    auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
    ASM(MOV64mr, mem, FE_DI);
    // ... (the offset in mem is advanced by 8 between the GP stores)
    ASM(MOV64mr, mem, FE_SI);
    ASM(MOV64mr, mem, FE_DX);
    ASM(MOV64mr, mem, FE_CX);
    ASM(MOV64mr, mem, FE_R8);
    ASM(MOV64mr, mem, FE_R9);
    auto skip_fp = this->text_writer.label_create();
    ASM(TEST8rr, FE_AX, FE_AX);
    // ... (skip the XMM stores when AL is zero)
    ASM(SSE_MOVDQUmr, mem, FE_XMM0);
    // ... (the offset in mem is advanced by 16 between the XMM stores)
    ASM(SSE_MOVDQUmr, mem, FE_XMM1);
    ASM(SSE_MOVDQUmr, mem, FE_XMM2);
    ASM(SSE_MOVDQUmr, mem, FE_XMM3);
    ASM(SSE_MOVDQUmr, mem, FE_XMM4);
    ASM(SSE_MOVDQUmr, mem, FE_XMM5);
    ASM(SSE_MOVDQUmr, mem, FE_XMM6);
    ASM(SSE_MOVDQUmr, mem, FE_XMM7);
    // ... (place the skip_fp label)
  }
}
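// Note: this is the standard System V varargs register save area: 6 * 8 bytes
// for the GP argument registers followed by 8 * 16 bytes for XMM0-XMM7
// (176 bytes total); AL carries the number of XMM registers actually used by
// the caller, so the XMM stores are skipped when it is zero.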
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
std::optional<i32>
    CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_assign_arg_part(
        ValuePart &&vp, CCAssignment cca) noexcept {
  if (cca.reg.valid()) [[likely]] {
    vp.set_value_reg(this, cca.reg);
    // ...
    this->register_file.allocatable |= u64{1} << cca.reg.id();
    // ...
  }

  // Stack argument: 0x10 skips the pushed RBP and the return address.
  this->stack.frame_used = true;
  i32 frame_off = 0x10 + cca.stack_off;
  if (/* ... */) {
    // ...
  } else if (vp.assignment().assignment()->part_count == 1 &&
             !vp.assignment().register_valid()) {
    // Keep the value in its caller-provided stack slot.
    // ...
    vp.assignment().set_stack_valid();
    vp.assignment().assignment()->frame_off = frame_off;
  } else {
    AsmReg dst = vp.alloc_reg(this);
    this->load_from_stack(dst, frame_off, cca.size);
  }
  // ...
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_end(
    CCAssigner *cc_assigner) noexcept {
  if (this->adaptor->cur_is_vararg()) [[unlikely]] {
    // ...
    const CCInfo &cc_info = cc_assigner->get_ccinfo();
    auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
    u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
    u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
    this->scalar_arg_count = std::popcount(gp_regs);
    this->vec_arg_count = std::popcount(xmm_regs);
    this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
  }
  // ...
}
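// finish_func (below) writes the real prologue into the gap reserved by
// prologue_begin: push rbp / mov rbp, rsp, one push per clobbered callee-saved
// register, and an optional sub rsp, imm32, together with matching CFI. The
// placeholder bytes left at each return site by gen_func_epilog are then
// patched into jumps to the single epilogue at the end of the function.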
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
    u32 func_idx) noexcept {
  const CCInfo &ccinfo = derived()->cur_cc_assigner()->get_ccinfo();
  auto csr = ccinfo.callee_saved_regs;
  u64 saved_regs = this->register_file.clobbered & csr;

  bool needs_stack_frame = this->stack.frame_used ||
                           this->stack.generated_call ||
                           this->stack.has_dynamic_alloca || saved_regs != 0;

  u32 prologue_size = 0;
  u32 num_saved_regs = 0;
  u32 rsp_adjustment = 0;

  this->text_writer.eh_begin_fde(this->get_personality_sym());

  if (needs_stack_frame) {
    if (!func_ret_offs.empty()) {
      this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
    }
    // CFI for push rbp; mov rbp, rsp.
    this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
    this->text_writer.eh_write_inst(
        dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
                                    dwarf::x64::DW_reg_rbp);

    auto fde_prologue_adv_off = this->text_writer.eh_writer.size();
    if (saved_regs != 0) {
      // Placeholder; patched below once the prologue size is known.
      this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
    }

    // Write the real prologue into the space reserved by prologue_begin.
    auto *write_ptr = this->text_writer.begin_ptr() + func_start_off;
    write_ptr += fe64_PUSHr(write_ptr, 0, FE_BP);
    write_ptr += fe64_MOV64rr(write_ptr, 0, FE_BP, FE_SP);
    for (auto reg : util::BitSetIterator{saved_regs}) {
      assert(reg <= AsmReg::R15);
      write_ptr +=
          fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
      // ...
      static const u8 gpreg_to_dwarf[] = {
          dwarf::x64::DW_reg_rax,
          dwarf::x64::DW_reg_rcx,
          dwarf::x64::DW_reg_rdx,
          dwarf::x64::DW_reg_rbx,
          dwarf::x64::DW_reg_rsp,
          dwarf::x64::DW_reg_rbp,
          dwarf::x64::DW_reg_rsi,
          dwarf::x64::DW_reg_rdi,
          dwarf::x64::DW_reg_r8,
          dwarf::x64::DW_reg_r9,
          dwarf::x64::DW_reg_r10,
          dwarf::x64::DW_reg_r11,
          dwarf::x64::DW_reg_r12,
          dwarf::x64::DW_reg_r13,
          dwarf::x64::DW_reg_r14,
          dwarf::x64::DW_reg_r15,
      };
      u8 dwarf_reg = gpreg_to_dwarf[reg];
      auto cfa_off = num_saved_regs + 2;
      this->text_writer.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
      // ...
    }

    // assert(... &&
    //     "stack with dynamic alloca must adjust stack pointer at call sites");
    // ...
    u32 final_frame_size = /* ... */;
    rsp_adjustment = final_frame_size - num_saved_regs * 8;
    bool needs_rsp_adjustment = this->stack.generated_call ||
                                this->stack.has_dynamic_alloca ||
                                rsp_adjustment > ccinfo.red_zone_size;
    if (needs_rsp_adjustment) {
      write_ptr += fe64_SUB64ri(write_ptr, 0, FE_SP, rsp_adjustment);
    }
    // ...
    prologue_size =
        write_ptr - (this->text_writer.begin_ptr() + func_start_off);
    assert(prologue_size <= func_prologue_alloc);
    if (saved_regs != 0) {
      assert(prologue_size < 0x44 && "cannot encode too large prologue in CFI");
      this->text_writer.eh_writer.data()[fde_prologue_adv_off] =
          dwarf::DW_CFA_advance_loc | (prologue_size - 4);
    }
  }

  // Patch the placeholder jumps emitted by gen_func_epilog to point at the
  // single epilogue emitted below; a trailing placeholder is simply dropped.
  if (!func_ret_offs.empty()) {
    u8 *text_data = this->text_writer.begin_ptr();
    if (func_ret_offs.back() == this->text_writer.offset() - 5) {
      this->text_writer.cur_ptr() -= 5;
      func_ret_offs.pop_back();
    }
    for (auto ret_off : func_ret_offs) {
      fe64_JMP(text_data + ret_off, FE_JMPL, this->text_writer.cur_ptr());
    }
  }
  // ...

  this->text_writer.ensure_space(prologue_size + 1);
  if (needs_stack_frame) {
    if (this->stack.has_dynamic_alloca) {
      if (num_saved_regs == 0) {
        ASMNC(MOV64rr, FE_SP, FE_BP);
      } else {
        i32 reg_save_size = num_saved_regs * 8;
        ASMNC(LEA64rm, FE_SP, FE_MEM(FE_BP, 0, FE_NOREG, -reg_save_size));
      }
    } else if (rsp_adjustment != 0) {
      ASMNC(ADD64ri, FE_SP, rsp_adjustment);
    } else {
      assert(ccinfo.red_zone_size >= num_saved_regs * 8 &&
             "unwind info incorrect for calling conv without red zone");
    }
    for (auto reg : util::BitSetIterator<true>{saved_regs}) {
      ASMNC(POPr, AsmReg(reg));
    }
    // ... (pop rbp, ret, restore CFI state)
    u32 body_start = func_start_off + func_prologue_alloc;
    this->text_writer.eh_advance(this->text_writer.offset() - body_start);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_restore_state);
  }
  // ...

  // Drop the unused tail of the reserved prologue region and define the
  // function symbol.
  this->text_writer.remove_prologue_bytes(func_start_off + prologue_size,
                                          func_prologue_alloc - prologue_size);
  auto func_size = this->text_writer.offset() - func_start_off;
  auto func_sym = this->func_syms[func_idx];
  auto func_sec = this->text_writer.get_sec_ref();
  this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
  this->text_writer.eh_end_fde();
  this->text_writer.except_encode_func();
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
  func_ret_offs.clear();
  // ...
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
  // Reserve 5 bytes for a jump to the shared epilogue; patched in finish_func.
  func_ret_offs.push_back(this->text_writer.offset());
  this->text_writer.ensure_space(5);
  this->text_writer.cur_ptr() += 5;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
    const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
  assert(this->stack.frame_used);
  this->text_writer.ensure_space(16);
  assert(frame_off < 0);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
  if (reg.id() <= AsmReg::R15) {
    switch (size) {
    case 1: ASMNC(MOV8mr, mem, reg); break;
    case 2: ASMNC(MOV16mr, mem, reg); break;
    case 4: ASMNC(MOV32mr, mem, reg); break;
    case 8: ASMNC(MOV64mr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  } else {
    switch (size) {
    case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
    case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
    case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
    const AsmReg dst,
    const i32 frame_off,
    const u32 size,
    const bool sign_extend) noexcept {
  assert(this->stack.frame_used);
  this->text_writer.ensure_space(16);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);

  if (dst.id() <= AsmReg::R15) {
    if (!sign_extend) {
      switch (size) {
      case 1: ASMNC(MOVZXr32m8, dst, mem); break;
      case 2: ASMNC(MOVZXr32m16, dst, mem); break;
      case 4: ASMNC(MOV32rm, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    } else {
      switch (size) {
      case 1: ASMNC(MOVSXr64m8, dst, mem); break;
      case 2: ASMNC(MOVSXr64m16, dst, mem); break;
      case 4: ASMNC(MOVSXr64m32, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    }
  } else {
    assert(!sign_extend);
    switch (size) {
    case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
    case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
    case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
    const AsmReg dst, const AssignmentPartRef ap) noexcept {
  assert(this->stack.frame_used);
  ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
    const AsmReg dst, const AsmReg src, const u32 size) noexcept {
  this->text_writer.ensure_space(16);
  // ...
  if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
    if (size > 4) {
      ASMNC(MOV64rr, dst, src);
    } else {
      ASMNC(MOV32rr, dst, src);
    }
  } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
    if (size <= 16) {
      if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
        assert(has_cpu_feats(CPU_AVX512F));
        ASMNC(VMOVAPD128rr, dst, src);
      } else {
        ASMNC(SSE_MOVAPDrr, dst, src);
      }
    } else if (size <= 32) {
      assert(has_cpu_feats(CPU_AVX));
      assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
             has_cpu_feats(CPU_AVX512F));
      ASMNC(VMOVAPD256rr, dst, src);
    } else {
      assert(has_cpu_feats(CPU_AVX512F));
      ASMNC(VMOVAPD512rr, dst, src);
    }
  } else if (dst.id() <= AsmReg::R15) {
    // XMM -> GP move.
    assert(src.id() >= AsmReg::XMM0);
    // ...
    if (src.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASMNC(VMOVD_X2Grr, dst, src);
      } else {
        ASMNC(VMOVQ_X2Grr, dst, src);
      }
    } else if (size <= 4) {
      ASMNC(SSE_MOVD_X2Grr, dst, src);
    } else {
      ASMNC(SSE_MOVQ_X2Grr, dst, src);
    }
  } else {
    // GP -> XMM move.
    assert(src.id() <= AsmReg::R15);
    assert(dst.id() >= AsmReg::XMM0);
    // ...
    if (dst.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASMNC(VMOVD_G2Xrr, dst, src);
      } else {
        ASMNC(VMOVQ_G2Xrr, dst, src);
      }
    } else if (size <= 4) {
      ASMNC(SSE_MOVD_G2Xrr, dst, src);
    } else {
      ASMNC(SSE_MOVQ_G2Xrr, dst, src);
    }
  }
}
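// gval_expr_as_reg materializes a GenericValuePart address expression
// (base + scale * index + displacement) into a single GP register. When the
// scale is 1, 2, 4 or 8 and the displacement fits into 32 bits, a single LEA
// suffices; otherwise the index is pre-scaled with SHL/IMUL (which clobbers
// the flags) and the pieces are combined with LEA or ADD. Any ScratchReg
// owned by the expression is reused as the destination register.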
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
    GenericValuePart &gv) noexcept {
  auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);

  ScratchReg scratch{derived()};
  bool disp32 = i32(expr.disp) == expr.disp;
  AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
  AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
  if (std::holds_alternative<ScratchReg>(expr.base)) {
    scratch = std::move(std::get<ScratchReg>(expr.base));
  } else if (std::holds_alternative<ScratchReg>(expr.index)) {
    scratch = std::move(std::get<ScratchReg>(expr.index));
  } else {
    (void)scratch.alloc_gp();
  }
  auto dst = scratch.cur_reg();

  if (expr.has_index()) {
    if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
      // Power-of-two scale up to 8: a single LEA does the job.
      u8 sc = expr.scale;
      if (base.valid() && disp32) {
        ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
        // ...
      } else if (base.valid()) {
        ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
      } else if (disp32) {
        ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
        // ...
      } else {
        ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
      }
      // ...
    } else {
      // Pre-scale the index explicitly; this clobbers the flags.
      assert(may_clobber_flags());
      u64 scale = expr.scale;
      // ...
      base = AsmReg::make_invalid();
      // ...
      ScratchReg idx_scratch{derived()};
      // ...
      AsmReg idx_tmp = dst;
      if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
        // ...
        idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
      } else if (dst == base) {
        idx_tmp = idx_scratch.alloc_gp();
      }
      // ...
      if ((scale & (scale - 1)) == 0) {
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        }
        ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
      } else {
        if (i32(scale) == i64(scale)) {
          ASM(IMUL64rri, idx_tmp, idx, scale);
        } else {
          ScratchReg scratch2{derived()};
          auto tmp2 = scratch2.alloc_gp();
          ASM(MOV64ri, tmp2, scale);
          if (idx_tmp != idx) {
            ASM(MOV64rr, idx_tmp, idx);
          }
          ASM(IMUL64rr, idx_tmp, tmp2);
        }
      }
      // ...
      if (disp32 || (idx_tmp != dst && base != dst)) {
        ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
        // ...
      } else if (dst == base) {
        ASM(ADD64rr, dst, idx_tmp);
      } else {
        ASM(ADD64rr, dst, base);
      }
      // ...
    }
  } else if (base.valid()) {
    if (expr.disp && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
      // ...
    } else if (dst != base) {
      ASM(MOV64rr, dst, base);
    }
    // ...
  }
  // ...
  if (/* ... (displacement did not fit the addressing mode) */) {
    ScratchReg scratch2{derived()};
    auto tmp2 = scratch2.alloc_gp();
    ASM(MOV64ri, tmp2, expr.disp);
    if (may_clobber_flags()) {
      ASM(ADD64rr, dst, tmp2);
    } else {
      ASM(LEA64rm, dst, FE_MEM(dst, 1, tmp2, 0));
    }
  }
  // ...
  gv.state = std::move(scratch);
  return dst;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
    u64 size, u32 align, ValuePart &res) noexcept {
  assert(this->stack.has_dynamic_alloca &&
         "function marked as not having dynamic allocas can't have alloca");
  assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
  assert(may_clobber_flags());
  size = tpde::util::align_up(size, 16);
  // ...
  assert(size < 0x8000'0000);
  ASM(SUB64ri, FE_SP, size);
  // ...
  if (align > 16) {
    assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
    ASM(AND64ri, FE_SP, ~(align - 1));
  }
  ASM(MOV64rr, res.alloc_reg(this), FE_SP);
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
    u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
  assert(this->stack.has_dynamic_alloca &&
         "function marked as not having dynamic allocas can't have alloca");
  assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
  assert(may_clobber_flags());
  AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
  AsmReg res_reg = res.alloc_try_reuse(this, count);

  // Compute elem_size * count into res_reg.
  if (elem_size == 0) {
    ASM(XOR32rr, res_reg, res_reg);
  } else if ((elem_size & (elem_size - 1)) == 0) {
    // Power-of-two element size: scale with LEA or a shift.
    const auto shift = util::cnt_tz(elem_size);
    if (shift > 0 && shift < 4) {
      ASM(LEA64rm, res_reg, FE_MEM(FE_NOREG, u8(1 << shift), size_reg, 0));
    } else {
      if (size_reg != res_reg) {
        ASM(MOV64rr, res_reg, size_reg);
      }
      if (elem_size != 1) {
        ASM(SHL64ri, res_reg, shift);
      }
    }
  } else {
    if (elem_size <= 0x7FFF'FFFF) [[likely]] {
      ASM(IMUL64rri, res_reg, size_reg, elem_size);
    } else {
      ScratchReg scratch{this};
      auto tmp = scratch.alloc_gp();
      ASM(MOV64ri, tmp, elem_size);
      if (size_reg != res_reg) {
        ASM(MOV64rr, res_reg, size_reg);
      }
      ASM(IMUL64rr, res_reg, tmp);
    }
  }
  // ...
  ASM(SUB64rr, FE_SP, res_reg);
  // ...
  align = align > 16 ? align : 16;
  if (elem_size & (align - 1)) {
    assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
    ASM(AND64ri, FE_SP, ~(align - 1));
  }
  ASM(MOV64rr, res_reg, FE_SP);
  // ...
}
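// materialize_constant picks the cheapest encoding it can (sketch of the
// cases below): zero and all-ones get XOR/PXOR and PCMPEQB (or their AVX
// forms), small GP constants use MOV32ri/MOV64ri, small FP constants are
// built in a free GP register and moved across with MOVD/MOVQ, and anything
// else is emitted into the read-only data section and loaded RIP-relative
// with an R_X86_64_PC32 relocation.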
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
    const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
  const auto const_u64 = data[0];
  if (bank == Config::GP_BANK) {
    // ...
    if (const_u64 == 0) {
      if (may_clobber_flags()) {
        ASM(XOR32rr, dst, dst);
      } else {
        ASM(MOV32ri, dst, 0);
      }
      return;
    }
    // ...
    if (size <= 4 || u32(const_u64) == const_u64) {
      ASM(MOV32ri, dst, const_u64);
    } else {
      ASM(MOV64ri, dst, const_u64);
    }
    return;
  }

  assert(bank == Config::FP_BANK);
  const auto high_u64 = size <= 8 ? 0 : data[1];
  if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPXOR128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PXORrr, dst, dst);
    }
    return;
  }
  const u64 ones = -u64{1};
  if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPCMPEQB128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PCMPEQBrr, dst, dst);
    }
    return;
  }

  // Build small FP constants in a free GP register and move them across.
  // ...
  Reg tmp =
      this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
  // ...
  this->register_file.mark_clobbered(tmp);
  // ... (materialize const_u64 into tmp)
  if (size <= 4) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVD_G2Xrr, dst, tmp);
    } else {
      ASM(SSE_MOVD_G2Xrr, dst, tmp);
    }
  } else {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVQ_G2Xrr, dst, tmp);
    } else {
      ASM(SSE_MOVQ_G2Xrr, dst, tmp);
    }
  }
  // ...

  // Fall back to a RIP-relative load from the read-only data section.
  auto alloc_size = util::align_up(size, 8);
  std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
  // ...
  auto rodata = this->assembler.get_data_section(true, false);
  auto sym = this->assembler.sym_def_data(
      /* ... (arguments elided in this excerpt) */);
  if (size <= 4) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 8) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 16) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else {
    TPDE_FATAL("unable to materialize constant");
  }
  this->reloc_text(sym, elf::R_X86_64_PC32, this->text_writer.offset() - 4, -4);
}
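// select_fixed_assignment_reg picks the register for a fixed value
// assignment: for non-leaf functions it prefers callee-saved registers (so
// the value survives calls) and otherwise prefers caller-saved ones, falling
// back to the callee-saved set; among the candidates it first takes a
// register that is completely free and otherwise settles for one whose
// current value is unmodified and can be dropped.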
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
        AssignmentPartRef ap, IRValueRef) noexcept {
  RegBank bank = ap.bank();
  assert(bank.id() <= Config::NUM_BANKS);
  auto reg_mask = this->register_file.bank_regs(bank);
  reg_mask &= ~fixed_assignment_nonallocatable_mask;

  const auto find_possible_regs = [this,
                                   reg_mask](const u64 preferred_regs) -> u64 {
    // ...
    u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
    return free_regs & preferred_regs & reg_mask;
  };

  u64 possible_regs;
  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  if (!this->stack.is_leaf_function) {
    // Prefer callee-saved registers so the value survives calls.
    possible_regs = find_possible_regs(csr);
    // ...
  } else {
    possible_regs = find_possible_regs(~csr);
    if (possible_regs == 0) {
      // ...
      possible_regs = find_possible_regs(csr);
    }
  }

  if (possible_regs == 0) {
    return AsmReg::make_invalid();
  }

  // Prefer completely unused registers.
  if ((possible_regs & ~this->register_file.used) != 0) {
    return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
  }

  for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
    const auto reg = AsmReg{reg_id};

    if (this->register_file.is_fixed(reg)) {
      continue;
    }

    const auto local_idx = this->register_file.reg_local_idx(reg);
    const auto part = this->register_file.reg_part(reg);

    if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
      continue;
    }
    auto *assignment = this->val_assignment(local_idx);
    auto ap = AssignmentPartRef{assignment, part};
    if (ap.modified()) {
      continue;
    }
    // ...
    return reg;
  }

  return AsmReg::make_invalid();
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  // ... (each condition is mapped to its logical negation)
  default: TPDE_UNREACHABLE("invalid jump kind for invert_jump");
  }
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  // ... (each condition is mapped to its operand-swapped counterpart)
  default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
FeCond CompilerX64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
    Jump jmp) noexcept {
  // The Jump enumerators are laid out so that the FeCond value can be derived
  // with a shift; the switch only double-checks this in debug builds.
  FeCond res = FeCond(u32(jmp) << 16);
  switch (jmp) {
  case Jump::ja: assert(res == FE_CC_A && "FeCond value mismatch?"); break;
  case Jump::jae: assert(res == FE_CC_AE && "FeCond value mismatch?"); break;
  case Jump::jb: assert(res == FE_CC_B && "FeCond value mismatch?"); break;
  case Jump::jbe: assert(res == FE_CC_BE && "FeCond value mismatch?"); break;
  case Jump::je: assert(res == FE_CC_E && "FeCond value mismatch?"); break;
  case Jump::jg: assert(res == FE_CC_G && "FeCond value mismatch?"); break;
  case Jump::jge: assert(res == FE_CC_GE && "FeCond value mismatch?"); break;
  case Jump::jl: assert(res == FE_CC_L && "FeCond value mismatch?"); break;
  case Jump::jle: assert(res == FE_CC_LE && "FeCond value mismatch?"); break;
  case Jump::jne: assert(res == FE_CC_NE && "FeCond value mismatch?"); break;
  case Jump::jno: assert(res == FE_CC_NO && "FeCond value mismatch?"); break;
  case Jump::jo: assert(res == FE_CC_O && "FeCond value mismatch?"); break;
  case Jump::js: assert(res == FE_CC_S && "FeCond value mismatch?"); break;
  case Jump::jns: assert(res == FE_CC_NS && "FeCond value mismatch?"); break;
  case Jump::jp: assert(res == FE_CC_P && "FeCond value mismatch?"); break;
  case Jump::jnp: assert(res == FE_CC_NP && "FeCond value mismatch?"); break;
  default: TPDE_UNREACHABLE("invalid conditional jump");
  }
  return res;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
    Jump jmp, Label target_label) noexcept {
  this->text_writer.ensure_space(6);
  bool pending = this->text_writer.label_is_pending(target_label);
  void *target = this->text_writer.cur_ptr();
  if (!pending) {
    target = this->text_writer.begin_ptr() +
             this->text_writer.label_offset(target_label);
  }
  // ...
  if (jmp == Jump::jmp) {
    ASMNCF(JMP, pending ? FE_JMPL : 0, target);
  } else {
    ASMNCF(Jcc, (pending ? FE_JMPL : 0) | jump_to_cond(jmp), target);
  }
  // For labels that are not placed yet, record a fixup for the 4-byte
  // displacement just emitted.
  if (pending) {
    this->text_writer.label_ref(target_label,
                                this->text_writer.offset() - 4,
                                LabelFixupKind::X64_JMP_OR_MEM_DISP);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
    Jump cc, AsmReg dst, bool zext) noexcept {
  if (zext) {
    ASM(MOV32ri, dst, 0);
  }
  ASMF(SETcc8r, jump_to_cond(cc), dst);
}

template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
    Jump cc, AsmReg dst) noexcept {
  // ... (sets all bits of dst when cc holds, clears it otherwise)
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
    Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
  if (is_64) {
    ASMF(CMOVcc64rr, jump_to_cond(cc), dst, src);
  } else {
    ASMF(CMOVcc32rr, jump_to_cond(cc), dst, src);
  }
}
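// generate_raw_intext extends the low `from` bits of src into a `to`-bit
// value in dst. Byte/word/dword widths map directly onto MOVZX/MOVSX/MOV;
// odd widths are zero-extended with an AND mask or sign-extended with a
// SHL/SAR pair, which is why the helper asserts that it may clobber flags.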
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
  assert(from < to && to <= 64);
  assert(may_clobber_flags());
  if (!sign) {
    switch (from) {
    case 8: ASM(MOVZXr32r8, dst, src); break;
    case 16: ASM(MOVZXr32r16, dst, src); break;
    case 32: ASM(MOV32rr, dst, src); break;
    default:
      if (from < 32) {
        if (dst != src) {
          ASM(MOV32rr, dst, src);
        }
        ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
      } else if (dst != src) {
        ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
        ASM(AND64rr, dst, src);
      } else {
        ScratchReg tmp{this};
        AsmReg tmp_reg = tmp.alloc_gp();
        ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
        ASM(AND64rr, dst, tmp_reg);
      }
      break;
    }
  } else if (to <= 32) {
    switch (from) {
    case 8: ASM(MOVSXr32r8, dst, src); break;
    case 16: ASM(MOVSXr32r16, dst, src); break;
    default:
      if (dst != src) {
        ASM(MOV32rr, dst, src);
      }
      ASM(SHL32ri, dst, 32 - from);
      ASM(SAR32ri, dst, 32 - from);
      break;
    }
  } else {
    switch (from) {
    case 8: ASM(MOVSXr64r8, dst, src); break;
    case 16: ASM(MOVSXr64r16, dst, src); break;
    case 32: ASM(MOVSXr64r32, dst, src); break;
    default:
      if (dst != src) {
        ASM(MOV64rr, dst, src);
      }
      ASM(SHL64ri, dst, 64 - from);
      ASM(SAR64ri, dst, 64 - from);
      break;
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfi(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
  assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
  assert(may_clobber_flags());
  ScratchReg tmp1{this};
  AsmReg tmp1_reg = tmp1.alloc_gp();

  // Clear the destination bitfield.
  if (width == 1) {
    ASM(BTR64ri, dst, lsb);
  } else if (lsb + width <= 31) {
    ASM(AND64ri, dst, ~(((u64{1} << width) - 1) << lsb));
  } else {
    ASM(MOV64ri, tmp1_reg, ~(((u64{1} << width) - 1) << lsb));
    ASM(AND64rr, dst, tmp1_reg);
  }

  // Mask the source down to the field width.
  if (width == 8) {
    ASM(MOVZXr32r8, tmp1_reg, src);
  } else if (width == 16) {
    ASM(MOVZXr32r16, tmp1_reg, src);
  } else if (width <= 32) {
    ASM(MOV32rr, tmp1_reg, src);
    if (width != 32) {
      ASM(AND32ri, tmp1_reg, (u32{1} << width) - 1);
    }
  } else {
    ASM(MOV64ri, tmp1_reg, (u64{1} << width) - 1);
    ASM(AND64rr, tmp1_reg, src);
  }

  // Shift the field into place and merge it into dst.
  if (lsb >= 1 && lsb <= 3) {
    ASM(LEA64rm, dst, FE_MEM(dst, u8(1 << lsb), tmp1_reg, 0));
  } else {
    if (lsb > 0 && lsb + width <= 32) {
      ASM(SHL32ri, tmp1_reg, lsb);
    } else if (lsb > 0) {
      ASM(SHL64ri, tmp1_reg, lsb);
    }
    ASM(OR64rr, dst, tmp1_reg);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfiz(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
  assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
  assert(may_clobber_flags());

  // Zero-extend the field from src into dst ...
  if (width == 8) {
    ASM(MOVZXr32r8, dst, src);
  } else if (width == 16) {
    ASM(MOVZXr32r16, dst, src);
  } else if (width <= 32) {
    ASM(MOV32rr, dst, src);
    if (width != 32) {
      ASM(AND32ri, dst, (u32{1} << width) - 1);
    }
  } else {
    ASM(MOV64ri, dst, (u64{1} << width) - 1);
    ASM(AND64rr, dst, src);
  }

  // ... then shift it to its position.
  if (lsb > 0 && lsb + width <= 32) {
    ASM(SHL32ri, dst, lsb);
  } else if (lsb > 0) {
    ASM(SHL64ri, dst, lsb);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
    set_stack_used() noexcept {
  if (this->compiler.stack.has_dynamic_alloca && stack_adjust_off == 0) {
    stack_adjust_off = this->compiler.text_writer.offset();
    // Emit a placeholder sub rsp, imm32 (7 bytes); the immediate is patched
    // in call_impl once the final argument area size is known.
    ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
    assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
    ValuePart &vp, CCAssignment &cca) noexcept {
  AsmReg ptr = vp.load_to_reg(&this->compiler);
  ScratchReg scratch{&this->compiler};
  AsmReg tmp = scratch.alloc_gp();

  // Copy the by-value object from [ptr] into its outgoing stack slot at
  // [rsp + cca.stack_off], 8/4/2/1 bytes at a time.
  auto size = cca.size;
  i32 off = 0;
  i32 soff = cca.stack_off;
  while (size >= 8) {
    ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV64mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
    off += 8;
    size -= 8;
  }
  if (size >= 4) {
    ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV32mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
    off += 4;
    size -= 4;
  }
  if (size >= 2) {
    ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV16mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
    off += 2;
    size -= 2;
  }
  if (size >= 1) {
    ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV8mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
    ValuePart &vp, CCAssignment &cca) noexcept {
  // ...
  auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
  FeMem mem_op = FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off));
  if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
    switch (cca.size) {
    case 1: ASMC(&this->compiler, MOV8mr, mem_op, reg); break;
    case 2: ASMC(&this->compiler, MOV16mr, mem_op, reg); break;
    case 4: ASMC(&this->compiler, MOV32mr, mem_op, reg); break;
    case 8: ASMC(&this->compiler, MOV64mr, mem_op, reg); break;
    default: TPDE_UNREACHABLE("invalid GP reg size");
    }
  } else {
    assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
    switch (cca.size) {
    case 4: ASMC(&this->compiler, SSE_MOVSSmr, mem_op, reg); break;
    case 8: ASMC(&this->compiler, SSE_MOVSDmr, mem_op, reg); break;
    case 16: ASMC(&this->compiler, SSE_MOVDQAmr, mem_op, reg); break;
    default: TPDE_UNREACHABLE("invalid SSE reg size");
    }
  }
  // ...
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
    std::variant<SymRef, ValuePart> &&target) noexcept {
  if (this->assigner.is_vararg()) {
    if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
      this->compiler.evict_reg(Reg{AsmReg::AX});
    }
    // AL = number of XMM registers used for arguments.
    Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
        Config::FP_BANK, 0);
    unsigned xmm_cnt = 8;
    if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
      xmm_cnt = next_xmm.id() - AsmReg::XMM0;
    }
    if (xmm_cnt != 0) {
      ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
    } else {
      ASMC(&this->compiler, XOR32rr, FE_AX, FE_AX);
    }
  }

  u32 sub = 0;
  if (stack_adjust_off != 0) {
    // Patch the immediate of the placeholder sub rsp, imm32.
    auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
    sub = util::align_up(this->assigner.get_stack_size(), 0x10);
    memcpy(inst_ptr + 3, &sub, sizeof(u32));
  } else {
    auto &max_stack_size = this->compiler.max_callee_stack_arg_size;
    max_stack_size = std::max(max_stack_size, this->assigner.get_stack_size());
  }

  if (auto *sym = std::get_if<SymRef>(&target)) {
    this->compiler.text_writer.ensure_space(16);
    ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
    this->compiler.reloc_text(
        *sym, elf::R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
  } else {
    ValuePart &tvp = std::get<ValuePart>(target);
    if (tvp.has_assignment() && !tvp.assignment().register_valid()) {
      assert(tvp.assignment().stack_valid());
      auto off = tvp.assignment().frame_off();
      ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
    } else if (tvp.can_salvage()) {
      ASMC(&this->compiler, CALLr, tvp.salvage(&this->compiler));
    } else {
      assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
      AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
      ASMC(&this->compiler, CALLr, reg);
    }
    tvp.reset(&this->compiler);
  }

  if (stack_adjust_off != 0) {
    ASMC(&this->compiler, ADD64ri, FE_SP, sub);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
    std::variant<SymRef, ValuePart> &&target,
    std::span<CallArg> arguments,
    typename Base::ValueRef *result,
    const bool variable_args) {
  // ... (set up the calling-convention assigner and a CallBuilder `cb`)
  for (auto &arg : arguments) {
    cb.add_arg(std::move(arg));
  }
  cb.call(std::move(target));
  if (result) {
    cb.add_ret(*result);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) noexcept {
  if (width_is_32) {
    ASM(CMP32ri, cmp_reg, case_value);
  } else {
    if ((i64)((i32)case_value) == (i64)case_value) {
      ASM(CMP64ri, cmp_reg, case_value);
    } else {
      // ... (materialize case_value in tmp_reg first)
      ASM(CMP64rr, cmp_reg, tmp_reg);
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
    const Label case_label,
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) noexcept {
  switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
  // ... (branch to case_label when the values compare equal)
}
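// switch_emit_jump_table emits a bounds check against [low_bound, high_bound],
// normalizes the selector to a zero-based index, and loads a 4-byte entry from
// a jump table addressed RIP-relative via LEA. Each table entry stores the
// label offset relative to the start of the table, so the target address is
// reconstructed with MOVSX + ADD; the table itself is emitted right after the
// indirect jump, aligned to 4 bytes, and entries for labels that are not
// placed yet get X64_JUMP_TABLE fixups.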
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
bool CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_jump_table(
    Label default_label,
    std::span<const Label> labels,
    AsmReg cmp_reg,
    AsmReg tmp_reg,
    u64 low_bound,
    u64 high_bound,
    bool width_is_32) noexcept {
  // Bounds check; out-of-range selectors go to default_label.
  if (low_bound != 0) {
    switch_emit_cmp(cmp_reg, tmp_reg, low_bound, width_is_32);
    // ... (branch below -> default_label)
  }
  switch_emit_cmp(cmp_reg, tmp_reg, high_bound, width_is_32);
  // ... (branch above -> default_label)
  if (width_is_32) {
    // Clear the upper 32 bits before indexing.
    ASM(MOV32rr, cmp_reg, cmp_reg);
  }
  // Normalize the selector to a zero-based table index.
  if (low_bound != 0) {
    if (i32(low_bound) == i64(low_bound)) {
      ASM(SUB64ri, cmp_reg, low_bound);
    } else {
      // ... (materialize low_bound in tmp_reg first)
      ASM(SUB64rr, cmp_reg, tmp_reg);
    }
  }

  Label jump_table = this->text_writer.label_create();
  ASM(LEA64rm, tmp_reg, FE_MEM(FE_IP, 0, FE_NOREG, -1));
  // ...
  this->text_writer.label_ref(jump_table,
                              this->text_writer.offset() - 4,
                              LabelFixupKind::X64_JMP_OR_MEM_DISP);
  // Load the table-relative offset and add it to the table address.
  ASM(MOVSXr64m32, cmp_reg, FE_MEM(tmp_reg, 4, cmp_reg, 0));
  ASM(ADD64rr, tmp_reg, cmp_reg);
  // ... (indirect jump through tmp_reg; place the jump_table label)

  this->text_writer.align(4);
  this->text_writer.ensure_space(4 + 4 * labels.size());
  // ...
  const u32 table_off = this->text_writer.offset();
  for (u32 i = 0; i < labels.size(); i++) {
    if (this->text_writer.label_is_pending(labels[i])) {
      this->text_writer.label_ref(labels[i],
                                  this->text_writer.offset(),
                                  LabelFixupKind::X64_JUMP_TABLE);
      this->text_writer.write(table_off);
    } else {
      const auto label_off = this->text_writer.label_offset(labels[i]);
      this->text_writer.write((i32)label_off - (i32)table_off);
    }
  }
  // ...
  return true;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
    const Label case_label,
    const Label gt_label,
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) noexcept {
  switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
  // ... (branch to gt_label when the selector is greater than case_value)
}
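// For the general-dynamic TLS model below, the emitted bytes follow the
// canonical sequence the linker expects for relaxation: a 0x66-prefixed
// lea rdi, sym@tlsgd[rip] followed by 0x66 0x66 0x48 call __tls_get_addr@plt,
// with R_X86_64_TLSGD and R_X86_64_PLT32 relocations. The call clobbers the
// caller-saved registers, so everything live in a non-callee-saved register
// is evicted first and the result comes back in RAX.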
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
        SymRef sym, TLSModel model) noexcept {
  switch (model) {
  // ...
  case TLSModel::GlobalDynamic: {
    // ...
    assert(!this->stack.is_leaf_function);
    assert(may_clobber_flags());
    this->stack.generated_call = true;
    // __tls_get_addr follows the SysV calling convention, so evict everything
    // that lives in a caller-saved register.
    auto csr = CCAssignerSysV::Info.callee_saved_regs;
    for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
      this->evict_reg(Reg{reg});
    }
    ScratchReg arg{this};
    AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);
    // ...
    // data16 lea rdi, sym@tlsgd[rip]
    this->text_writer.ensure_space(0x10);
    *this->text_writer.cur_ptr()++ = 0x66;
    ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
    this->reloc_text(
        sym, elf::R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
    // data16 data16 rex.w call __tls_get_addr@plt
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x48;
    ASMNC(CALL, this->text_writer.cur_ptr());
    if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
      this->sym_tls_get_addr = this->assembler.sym_add_undef(
          /* ... (symbol name and binding elided in this excerpt) */);
    }
    this->reloc_text(this->sym_tls_get_addr,
                     elf::R_X86_64_PLT32,
                     this->text_writer.offset() - 4,
                     -4);
    // The address of the TLS variable is returned in RAX.
    ScratchReg res{this};
    res.alloc_specific(AsmReg::AX);
    return res;
  }
  // ...
  }
}