#include "tpde/AssemblerElf.hpp"
#include "tpde/AssignmentPartRef.hpp"
#include "tpde/CompilerBase.hpp"
#include "tpde/FunctionWriter.hpp"
#include "tpde/base.hpp"
#include "tpde/x64/FunctionWriterX64.hpp"
#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
  #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
#endif
#define ASM_FULL(compiler, reserve, op, ...)                                   \
  ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))

#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMC(compiler, op, ...)                                                \
  ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMF(op, flag, ...)                                                    \
  ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNCF(op, flag, ...)                                                  \
  ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
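
// Convenience wrappers around asm_helper() and fadec's fe64_* encoders: ASM
// reserves 16 bytes of text space before encoding, the ASMNC/ASMNCF variants
// reserve nothing (the caller must already have ensured space), the *F
// variants pass an explicit flag word to the encoder, and ASMC takes the
// compiler object explicitly instead of implicitly using `this`.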
  constexpr explicit AsmReg() : Reg((u8)0xFF) {}

  constexpr AsmReg(const REG id) : Reg((u8)id) {}

  constexpr AsmReg(const Reg base) : Reg(base) {}

  constexpr explicit AsmReg(const u64 id) : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr operator FeRegGP() const {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  operator FeRegGPLH() const {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  constexpr operator FeRegXMM() const {
    assert(reg_id >= XMM0 && reg_id <= XMM15);
    return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
  }
constexpr static u64
    create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg;
  }
  return set;
}

template <size_t N>
constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg.id();
  }
  return set;
}
class CCAssignerSysV : public CCAssigner {
  static constexpr CCInfo Info{
      0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
      .callee_saved_regs = create_bitmask({
      .arg_regs = create_bitmask({
      .red_zone_size = 128,

  u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
  unsigned must_assign_stack = 0;
  u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;

  CCAssignerSysV(bool vararg = false) : CCAssigner(Info), vararg(vararg) {}
  void reset() override {
    gp_cnt = xmm_cnt = stack = 0;
    must_assign_stack = 0;
    ret_gp_cnt = ret_xmm_cnt = 0;
  }
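
  // Argument assignment follows the SysV rules: GP arguments use the six GP
  // argument registers, vector arguments the first eight XMM registers, and
  // everything else is placed in 8-byte-aligned stack slots. Once one part of
  // a consecutive group has to go to the stack, must_assign_stack forces the
  // remaining parts of that group onto the stack as well.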
  void assign_arg(CCAssignment &arg) override {
      stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
      arg.stack_off = stack;

    if (arg.bank == RegBank{0}) {
      static constexpr std::array<AsmReg, 6> gp_arg_regs{
      if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
        arg.reg = gp_arg_regs[gp_cnt];
      } else {
        must_assign_stack = arg.consecutive + 1;
        stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
        arg.stack_off = stack;
    } else if (arg.bank == RegBank{1}) {
      if (!must_assign_stack && xmm_cnt < 8) {
        arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
      } else {
        must_assign_stack = arg.consecutive + 1;
        u32 size = util::align_up(arg.size, 8);
        stack = util::align_up(stack, size);
        arg.stack_off = stack;
    } else {
      stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
      arg.stack_off = stack;
      stack += util::align_up(arg.size, 8);
    }

    if (must_assign_stack > 0) {
      must_assign_stack -= 1;
    }
  }
  u32 get_stack_size() override { return stack; }

  bool is_vararg() const override { return vararg; }
  void assign_ret(CCAssignment &arg) override {
    assert(!arg.byval && !arg.sret);
    if (arg.bank == RegBank{0}) {
      if (ret_gp_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
      } else {
        TPDE_UNREACHABLE("too many return values");
      }
    } else if (arg.bank == RegBank{1}) {
      if (ret_xmm_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
      } else {
        TPDE_UNREACHABLE("too many return values");
      }
    } else {
      TPDE_UNREACHABLE("return value must have valid register bank");
    }
  }
struct PlatformConfig : CompilerConfigDefault {
  using Assembler = tpde::elf::AssemblerElfX64;
  using AsmReg = tpde::x64::AsmReg;

  static constexpr RegBank GP_BANK{0};
  static constexpr RegBank FP_BANK{1};
  static constexpr bool FRAME_INDEXING_NEGATIVE = true;
  static constexpr u32 PLATFORM_POINTER_SIZE = 8;
  static constexpr u32 NUM_BANKS = 2;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy =
              CompilerBase,
          typename Config = PlatformConfig>
struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
  using Base = BaseTy<Adaptor, Derived, Config>;

  using IRValueRef = typename Base::IRValueRef;
  using IRBlockRef = typename Base::IRBlockRef;
  using IRFuncRef = typename Base::IRFuncRef;

  using ScratchReg = typename Base::ScratchReg;
  using ValuePartRef = typename Base::ValuePartRef;
  using ValuePart = typename Base::ValuePart;
  using GenericValuePart = typename Base::GenericValuePart;

  using RegisterFile = typename Base::RegisterFile;

  using CallArg = typename Base::CallArg;
  static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,

  static constexpr u32 MaxStaticAllocaSize = 0x10000000;
  enum CPU_FEATURES : u32 {
    CPU_BASELINE = 0,
    CPU_CMPXCHG16B = (1 << 0),
    CPU_POPCNT = (1 << 1),
    CPU_SSSE3 = (1 << 3),
    CPU_SSE4_1 = (1 << 4),
    CPU_SSE4_2 = (1 << 5),
    CPU_F16C = (1 << 10),
    CPU_LZCNT = (1 << 12),
    CPU_MOVBE = (1 << 13),
    CPU_AVX512F = (1 << 14),
    CPU_AVX512BW = (1 << 15),
    CPU_AVX512CD = (1 << 16),
    CPU_AVX512DQ = (1 << 17),
    CPU_AVX512VL = (1 << 18),

    CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
             CPU_SSE4_1 | CPU_SSE4_2,
    CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
             CPU_FMA | CPU_LZCNT | CPU_MOVBE,
    CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
             CPU_AVX512VL,
  };
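
  // The CPU_V2/CPU_V3/CPU_V4 groups above correspond to the x86-64-v2/-v3/-v4
  // microarchitecture feature levels defined by the psABI, so callers can
  // request a whole level instead of individual feature bits.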
  CPU_FEATURES cpu_feats = CPU_BASELINE;
  u64 fixed_assignment_nonallocatable_mask =
      create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});

  u32 func_start_off = 0u, func_prologue_alloc = 0u;
  u32 reg_save_frame_off = 0;
  u32 var_arg_stack_off = 0;
  util::SmallVector<u32, 8> func_ret_offs = {};
  class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
    u32 stack_adjust_off = 0;

    void set_stack_used();

  public:
    CallBuilder(Derived &compiler, CCAssigner &assigner)
        : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}

    void add_arg_byval(ValuePart &vp, CCAssignment &cca);
    void add_arg_stack(ValuePart &vp, CCAssignment &cca);
    void call_impl(std::variant<SymRef, ValuePart> &&target);
  };
                       const CPU_FEATURES cpu_features = CPU_BASELINE)
      : Base{adaptor}, cpu_feats(cpu_features) {
    static_assert(std::is_base_of_v<CompilerX64, Derived>);
  }
  template <typename... Args>
  auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
    struct Helper {
      CompilerX64 *compiler;
      unsigned (*fn)(u8 *, int, Args...);

      void encode(unsigned reserve, int flags, Args... args) {
        compiler->text_writer.ensure_space(reserve);
        unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
        compiler->text_writer.cur_ptr() += n;
      }
    };
    return Helper{this, enc_fn};
  }
  void start_func(u32 func_idx);

  void finish_func(u32 func_idx);

  void gen_func_epilog();

  void set_preserve_flags(bool preserve) { preserve_flags = preserve; }
  void spill_reg(const AsmReg reg, const i32 frame_off, const u32 size);

  void load_from_stack(AsmReg dst,
                       i32 frame_off,
                       u32 size,
                       bool sign_extend = false);

  void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap);

  void mov(AsmReg dst, AsmReg src, u32 size);

  GenericValuePart val_spill_slot(AssignmentPartRef ap) {
    assert(ap.stack_valid() && !ap.variable_ref());
    return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
  }

  AsmReg gval_expr_as_reg(GenericValuePart &gv);

  AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef);

  void generate_call(std::variant<SymRef, ValuePart> &&target,
                     std::span<CallArg> arguments,
                     typename Base::ValueRef *result,
                     bool variable_args = false);
  void switch_emit_cmp(AsmReg cmp_reg,
                       AsmReg tmp_reg,
                       u64 case_value,
                       bool width_is_32);

  void switch_emit_cmpeq(Label case_label,
                         AsmReg cmp_reg,
                         AsmReg tmp_reg,
                         u64 case_value,
                         bool width_is_32);

  FunctionWriterBase::JumpTable *switch_create_jump_table(Label default_label,

  void switch_emit_binary_step(Label case_label,
                               Label gt_label,
                               AsmReg cmp_reg,
                               AsmReg tmp_reg,
                               u64 case_value,
                               bool width_is_32);

  bool has_cpu_feats(CPU_FEATURES feats) const {
    return ((cpu_feats & feats) == feats);
  }
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(u32 func_idx) {
  this->preserve_flags = false;
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_begin(
    CCAssigner *cc_assigner) {
  func_ret_offs.clear();
  func_start_off = this->text_writer.offset();

  const CCInfo &cc_info = cc_assigner->get_ccinfo();

  auto csr = cc_info.callee_saved_regs;
  assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
         "non-gp callee-saved registers not implemented");

  u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
  u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
  // Pushes of the legacy GP registers encode in one byte, the REX-extended
  // ones (r8-r15) in two; the extra 11 bytes cover push rbp, mov rbp, rsp and
  // a sub rsp, imm32.
  u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
  this->stack.frame_size = 8 * (csr_logp + csr_higp);

  func_prologue_alloc = reg_save_size + 11;
  this->text_writer.ensure_space(func_prologue_alloc);
  this->text_writer.cur_ptr() += func_prologue_alloc;
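
  // For variadic functions, reserve the SysV register save area: the six GP
  // argument registers (6 * 8 bytes) followed by the eight XMM argument
  // registers (8 * 16 bytes). AL carries the number of vector arguments, so
  // the XMM stores below are skipped when it is zero.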
  if (this->adaptor->cur_is_vararg()) {
    this->stack.frame_used = true;
    this->stack.frame_size += 6 * 8 + 8 * 16;
    reg_save_frame_off = this->stack.frame_size;
    auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
    ASM(MOV64mr, mem, FE_DI);
    ASM(MOV64mr, mem, FE_SI);
    ASM(MOV64mr, mem, FE_DX);
    ASM(MOV64mr, mem, FE_CX);
    ASM(MOV64mr, mem, FE_R8);
    ASM(MOV64mr, mem, FE_R9);
    auto skip_fp = this->text_writer.label_create();
    ASM(TEST8rr, FE_AX, FE_AX);
    ASM(SSE_MOVDQUmr, mem, FE_XMM0);
    ASM(SSE_MOVDQUmr, mem, FE_XMM1);
    ASM(SSE_MOVDQUmr, mem, FE_XMM2);
    ASM(SSE_MOVDQUmr, mem, FE_XMM3);
    ASM(SSE_MOVDQUmr, mem, FE_XMM4);
    ASM(SSE_MOVDQUmr, mem, FE_XMM5);
    ASM(SSE_MOVDQUmr, mem, FE_XMM6);
    ASM(SSE_MOVDQUmr, mem, FE_XMM7);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
std::optional<i32>
    CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_assign_arg_part(
        ValuePart &&vp, CCAssignment cca) {
  if (cca.reg.valid()) [[likely]] {
    vp.set_value_reg(this, cca.reg);
    this->register_file.allocatable |= u64{1} << cca.reg.id();

  this->stack.frame_used = true;
  i32 frame_off = 0x10 + cca.stack_off;
  } else if (vp.assignment().assignment()->part_count == 1 &&
             !vp.assignment().register_valid()) {
    vp.assignment().set_stack_valid();
    vp.assignment().assignment()->frame_off = frame_off;

    AsmReg dst = vp.alloc_reg(this);
    this->load_from_stack(dst, frame_off, cca.size);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_end(
    CCAssigner *cc_assigner) {
  if (this->adaptor->cur_is_vararg()) [[unlikely]] {
    const CCInfo &cc_info = cc_assigner->get_ccinfo();
    auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
    u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
    u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
    this->scalar_arg_count = std::popcount(gp_regs);
    this->vec_arg_count = std::popcount(xmm_regs);
    this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
  }
}
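
// finish_func writes the real prologue into the placeholder bytes reserved by
// prologue_begin (frame setup, callee-saved pushes, stack adjustment), emits
// the matching CFI, patches the shared-epilogue jumps recorded in
// func_ret_offs, and finally trims any unused placeholder bytes.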
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(u32 func_idx) {
  const CCInfo &ccinfo = derived()->cur_cc_assigner()->get_ccinfo();
  auto csr = ccinfo.callee_saved_regs;
  u64 saved_regs = this->register_file.clobbered & csr;

  bool needs_stack_frame = this->stack.frame_used ||
                           this->stack.generated_call ||
                           this->stack.has_dynamic_alloca || saved_regs != 0;

  u32 prologue_size = 0;
  u32 num_saved_regs = 0;
  u32 rsp_adjustment = 0;

  this->text_writer.eh_begin_fde(this->get_personality_sym());

  if (needs_stack_frame) {
    if (!func_ret_offs.empty()) {
      this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
    }

    this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
    this->text_writer.eh_write_inst(
        dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);

    this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
                                    dwarf::x64::DW_reg_rbp);

    auto fde_prologue_adv_off = this->text_writer.eh_writer.size();
    if (saved_regs != 0) {
      this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
    }

    auto *write_ptr = this->text_writer.begin_ptr() + func_start_off;
    write_ptr += fe64_PUSHr(write_ptr, 0, FE_BP);
    write_ptr += fe64_MOV64rr(write_ptr, 0, FE_BP, FE_SP);
    for (auto reg : util::BitSetIterator{saved_regs}) {
      assert(reg <= AsmReg::R15);
      write_ptr +=
          fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
      static const u8 gpreg_to_dwarf[] = {
          dwarf::x64::DW_reg_rax,
          dwarf::x64::DW_reg_rcx,
          dwarf::x64::DW_reg_rdx,
          dwarf::x64::DW_reg_rbx,
          dwarf::x64::DW_reg_rsp,
          dwarf::x64::DW_reg_rbp,
          dwarf::x64::DW_reg_rsi,
          dwarf::x64::DW_reg_rdi,
          dwarf::x64::DW_reg_r8,
          dwarf::x64::DW_reg_r9,
          dwarf::x64::DW_reg_r10,
          dwarf::x64::DW_reg_r11,
          dwarf::x64::DW_reg_r12,
          dwarf::x64::DW_reg_r13,
          dwarf::x64::DW_reg_r14,
          dwarf::x64::DW_reg_r15,
      };
      u8 dwarf_reg = gpreg_to_dwarf[reg];
      auto cfa_off = num_saved_regs + 2;
      this->text_writer.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
    }

    assert(
        (!this->stack.has_dynamic_alloca || max_callee_stack_arg_size == 0) &&
        "stack with dynamic alloca must adjust stack pointer at call sites");

    u32 final_frame_size =
        util::align_up(this->stack.frame_size + max_callee_stack_arg_size, 16);
    rsp_adjustment = final_frame_size - num_saved_regs * 8;
    bool needs_rsp_adjustment = this->stack.generated_call ||
                                this->stack.has_dynamic_alloca ||
                                rsp_adjustment > ccinfo.red_zone_size;

    if (needs_rsp_adjustment) {
      write_ptr += fe64_SUB64ri(write_ptr, 0, FE_SP, rsp_adjustment);
    }

    prologue_size =
        write_ptr - (this->text_writer.begin_ptr() + func_start_off);
    assert(prologue_size <= func_prologue_alloc);
    if (saved_regs != 0) {
      assert(prologue_size < 0x44 && "cannot encode too large prologue in CFI");
      this->text_writer.eh_writer.data()[fde_prologue_adv_off] =
          dwarf::DW_CFA_advance_loc | (prologue_size - 4);
    }
  }
  if (!func_ret_offs.empty()) {
    u8 *text_data = this->text_writer.begin_ptr();
    if (func_ret_offs.back() == this->text_writer.offset() - 5) {
      this->text_writer.cur_ptr() -= 5;
      func_ret_offs.pop_back();
    }
    for (auto ret_off : func_ret_offs) {
      fe64_JMP(text_data + ret_off, FE_JMPL, this->text_writer.cur_ptr());
    }
  }

  this->text_writer.ensure_space(prologue_size + 1);
  if (needs_stack_frame) {
    if (this->stack.has_dynamic_alloca) {
      if (num_saved_regs == 0) {
        ASMNC(MOV64rr, FE_SP, FE_BP);
      } else {
        i32 reg_save_size = num_saved_regs * 8;
        ASMNC(LEA64rm, FE_SP, FE_MEM(FE_BP, 0, FE_NOREG, -reg_save_size));
      }
    } else if (rsp_adjustment != 0) {
      ASMNC(ADD64ri, FE_SP, rsp_adjustment);
    }

    assert(ccinfo.red_zone_size >= num_saved_regs * 8 &&
           "unwind info incorrect for calling conv without red zone");
    for (auto reg : util::BitSetIterator<true>{saved_regs}) {
      ASMNC(POPr, AsmReg(reg));
    }

    u32 body_start = func_start_off + func_prologue_alloc;
    this->text_writer.eh_advance(this->text_writer.offset() - body_start);
    this->text_writer.eh_write_inst(dwarf::DW_CFA_restore_state);
  }

  this->text_writer.remove_prologue_bytes(func_start_off + prologue_size,
                                          func_prologue_alloc - prologue_size);

  auto func_size = this->text_writer.offset() - func_start_off;
  auto func_sym = this->func_syms[func_idx];
  auto func_sec = this->text_writer.get_sec_ref();
  this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
  this->text_writer.eh_end_fde();
  this->text_writer.except_encode_func();
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() {
  func_ret_offs.clear();
  sym_tls_get_addr = {};
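
// gen_func_epilog only records the return site and reserves 5 bytes; the
// actual jump to the shared epilogue (or its removal, for the final return)
// is patched in by finish_func above.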
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() {
  func_ret_offs.push_back(this->text_writer.offset());
  this->text_writer.ensure_space(5);
  this->text_writer.cur_ptr() += 5;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
    const AsmReg reg, const i32 frame_off, const u32 size) {
  assert(this->stack.frame_used);
  this->text_writer.ensure_space(16);
  assert(frame_off < 0);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
  if (reg.id() <= AsmReg::R15) {
    switch (size) {
    case 1: ASMNC(MOV8mr, mem, reg); break;
    case 2: ASMNC(MOV16mr, mem, reg); break;
    case 4: ASMNC(MOV32mr, mem, reg); break;
    case 8: ASMNC(MOV64mr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  } else {
    switch (size) {
    case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
    case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
    case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
    const AsmReg dst,
    const i32 frame_off,
    const u32 size,
    const bool sign_extend) {
  assert(this->stack.frame_used);
  this->text_writer.ensure_space(16);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);

  if (dst.id() <= AsmReg::R15) {
    if (!sign_extend) {
      switch (size) {
      case 1: ASMNC(MOVZXr32m8, dst, mem); break;
      case 2: ASMNC(MOVZXr32m16, dst, mem); break;
      case 4: ASMNC(MOV32rm, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    } else {
      switch (size) {
      case 1: ASMNC(MOVSXr64m8, dst, mem); break;
      case 2: ASMNC(MOVSXr64m16, dst, mem); break;
      case 4: ASMNC(MOVSXr64m32, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    }
    return;
  }

  assert(!sign_extend);
  switch (size) {
  case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
  case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
  case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
  default: TPDE_UNREACHABLE("invalid spill size");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
    const AsmReg dst, const AssignmentPartRef ap) {
  assert(this->stack.frame_used);
  ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(const AsmReg dst,
                                                        const AsmReg src,
                                                        const u32 size) {
  this->text_writer.ensure_space(16);

  if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
    if (size > 4) {
      ASMNC(MOV64rr, dst, src);
    } else {
      ASMNC(MOV32rr, dst, src);
    }
  } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
    if (size <= 16) {
      if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
        assert(has_cpu_feats(CPU_AVX512F));
        ASMNC(VMOVAPD128rr, dst, src);
      } else {
        ASMNC(SSE_MOVAPDrr, dst, src);
      }
    } else if (size <= 32) {
      assert(has_cpu_feats(CPU_AVX));
      assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
             has_cpu_feats(CPU_AVX512F));
      ASMNC(VMOVAPD256rr, dst, src);
    } else {
      assert(has_cpu_feats(CPU_AVX512F));
      ASMNC(VMOVAPD512rr, dst, src);
    }
  } else if (dst.id() <= AsmReg::R15) {
    assert(src.id() >= AsmReg::XMM0);
    if (src.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASMNC(VMOVD_X2Grr, dst, src);
      } else {
        ASMNC(VMOVQ_X2Grr, dst, src);
      }
    } else if (size <= 4) {
      ASMNC(SSE_MOVD_X2Grr, dst, src);
    } else {
      ASMNC(SSE_MOVQ_X2Grr, dst, src);
    }
  } else {
    assert(src.id() <= AsmReg::R15);
    assert(dst.id() >= AsmReg::XMM0);
    if (dst.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASMNC(VMOVD_G2Xrr, dst, src);
      } else {
        ASMNC(VMOVQ_G2Xrr, dst, src);
      }
    } else if (size <= 4) {
      ASMNC(SSE_MOVD_G2Xrr, dst, src);
    } else {
      ASMNC(SSE_MOVQ_G2Xrr, dst, src);
    }
  }
}
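
// gval_expr_as_reg materializes a base + index * scale + displacement
// expression into a single GP register, preferring a single LEA when the
// scale is 1/2/4/8 and the displacement fits in 32 bits, and falling back to
// explicit shifts/multiplies and adds otherwise.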
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
    GenericValuePart &gv) {
  auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);

  ScratchReg scratch{derived()};
  bool disp32 = i32(expr.disp) == expr.disp;
  AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
  AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
  if (std::holds_alternative<ScratchReg>(expr.base)) {
    scratch = std::move(std::get<ScratchReg>(expr.base));
  } else if (std::holds_alternative<ScratchReg>(expr.index)) {
    scratch = std::move(std::get<ScratchReg>(expr.index));
    (void)scratch.alloc_gp();

  auto dst = scratch.cur_reg();

  if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
    if (base.valid() && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
    } else if (base.valid()) {
      ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
    } else if (disp32) {
      ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
      ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));

    assert(may_clobber_flags());
    u64 scale = expr.scale;
    base = AsmReg::make_invalid();

    ScratchReg idx_scratch{derived()};
    AsmReg idx_tmp = dst;
    if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
      idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
    } else if (dst == base) {
      idx_tmp = idx_scratch.alloc_gp();
    }

    if ((scale & (scale - 1)) == 0) {
      if (idx_tmp != idx) {
        ASM(MOV64rr, idx_tmp, idx);
      }
      ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
      if (i32(scale) == i64(scale)) {
        ASM(IMUL64rri, idx_tmp, idx, scale);
        ScratchReg scratch2{derived()};
        auto tmp2 = scratch2.alloc_gp();
        ASM(MOV64ri, tmp2, scale);
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        }
        ASM(IMUL64rr, idx_tmp, tmp2);

    if (disp32 || (idx_tmp != dst && base != dst)) {
      ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
    } else if (dst == base) {
      ASM(ADD64rr, dst, idx_tmp);
      ASM(ADD64rr, dst, base);

  } else if (base.valid()) {
    if (expr.disp && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
    } else if (dst != base) {
      ASM(MOV64rr, dst, base);

    ScratchReg scratch2{derived()};
    auto tmp2 = scratch2.alloc_gp();
    ASM(MOV64ri, tmp2, expr.disp);
    if (may_clobber_flags()) {
      ASM(ADD64rr, dst, tmp2);
      ASM(LEA64rm, dst, FE_MEM(dst, 1, tmp2, 0));

  gv.state = std::move(scratch);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
    u64 size, u32 align, ValuePart &res) {
  assert(this->stack.has_dynamic_alloca &&
         "function marked as not having dynamic allocas can't have alloca");
  assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
  assert(may_clobber_flags());
  size = tpde::util::align_up(size, 16);
  assert(size < 0x8000'0000);
  ASM(SUB64ri, FE_SP, size);
  assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
  ASM(AND64ri, FE_SP, ~(align - 1));
  ASM(MOV64rr, res.alloc_reg(this), FE_SP);
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
    u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) {
  assert(this->stack.has_dynamic_alloca &&
         "function marked as not having dynamic allocas can't have alloca");
  assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
  assert(may_clobber_flags());
  AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
  AsmReg res_reg = res.alloc_try_reuse(this, count);

  if (elem_size == 0) {
    ASM(XOR32rr, res_reg, res_reg);
  } else if ((elem_size & (elem_size - 1)) == 0) {
    const auto shift = util::cnt_tz(elem_size);
    if (shift > 0 && shift < 4) {
      ASM(LEA64rm, res_reg, FE_MEM(FE_NOREG, u8(1 << shift), size_reg, 0));
    } else {
      if (size_reg != res_reg) {
        ASM(MOV64rr, res_reg, size_reg);
      }
      if (elem_size != 1) {
        ASM(SHL64ri, res_reg, shift);
      }
    }
  } else {
    if (elem_size <= 0x7FFF'FFFF) [[likely]] {
      ASM(IMUL64rri, res_reg, size_reg, elem_size);
    } else {
      ScratchReg scratch{this};
      auto tmp = scratch.alloc_gp();
      ASM(MOV64ri, tmp, elem_size);
      if (size_reg != res_reg) {
        ASM(MOV64rr, res_reg, size_reg);
      }
      ASM(IMUL64rr, res_reg, tmp);
    }
  }

  ASM(SUB64rr, FE_SP, res_reg);

  align = align > 16 ? align : 16;
  if (elem_size & (align - 1)) {
    assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
    ASM(AND64ri, FE_SP, ~(align - 1));
  }

  ASM(MOV64rr, res_reg, FE_SP);
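
// Constant materialization: all-zero and all-ones vectors are synthesized
// with XOR/PCMPEQB, small GP constants with MOV; anything else is emitted
// into the read-only data section and loaded RIP-relative through an
// R_X86_64_PC32 relocation against that symbol.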
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
    const u64 *data, const RegBank bank, const u32 size, AsmReg dst) {
  const auto const_u64 = data[0];
  if (bank == Config::GP_BANK) {
    if (const_u64 == 0) {
      if (may_clobber_flags()) {
        ASM(XOR32rr, dst, dst);
      } else {
        ASM(MOV32ri, dst, 0);
      }

    if (size <= 4 || u32(const_u64) == const_u64) {
      ASM(MOV32ri, dst, const_u64);
    } else {
      ASM(MOV64ri, dst, const_u64);
    }

  assert(bank == Config::FP_BANK);
  const auto high_u64 = size <= 8 ? 0 : data[1];
  if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPXOR128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PXORrr, dst, dst);
    }

  const u64 ones = -u64{1};
  if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPCMPEQB128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PCMPEQBrr, dst, dst);
    }

    auto tmp =
        this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
    this->register_file.mark_clobbered(tmp);
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVD_G2Xrr, dst, tmp);
    } else {
      ASM(SSE_MOVD_G2Xrr, dst, tmp);
    }
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVQ_G2Xrr, dst, tmp);
    } else {
      ASM(SSE_MOVQ_G2Xrr, dst, tmp);
    }

  auto alloc_size = util::align_up(size, 8);
  std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
  auto rodata = this->assembler.get_default_section(SectionKind::ReadOnly);
  auto sym = this->assembler.sym_def_data(
  if (has_cpu_feats(CPU_AVX)) {
    ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
  } else {
    ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
  }
  } else if (size <= 8) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 16) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else {
    TPDE_FATAL("unable to materialize constant");
  }
  this->reloc_text(sym, elf::R_X86_64_PC32, this->text_writer.offset() - 4, -4);
}
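
// Fixed-assignment register selection: non-leaf functions prefer callee-saved
// registers (so the value survives calls), leaf functions prefer caller-saved
// ones and only fall back to callee-saved registers when none are free.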
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
    AssignmentPartRef ap, IRValueRef) {
  RegBank bank = ap.bank();
  assert(bank.id() <= Config::NUM_BANKS);
  auto reg_mask = this->register_file.bank_regs(bank);
  reg_mask &= ~fixed_assignment_nonallocatable_mask;

  const auto find_possible_regs = [this,
                                   reg_mask](const u64 preferred_regs) -> u64 {
    u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
    return free_regs & preferred_regs & reg_mask;
  };

  u64 possible_regs;
  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  if (!this->stack.is_leaf_function) {
    possible_regs = find_possible_regs(csr);
  } else {
    possible_regs = find_possible_regs(~csr);
    if (possible_regs == 0) {
      possible_regs = find_possible_regs(csr);
    }
  }

  if (possible_regs == 0) {
    return AsmReg::make_invalid();
  }

  if ((possible_regs & ~this->register_file.used) != 0) {
    return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
  }

  for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
    const auto reg = AsmReg{reg_id};

    if (this->register_file.is_fixed(reg)) {

    const auto local_idx = this->register_file.reg_local_idx(reg);
    const auto part = this->register_file.reg_part(reg);

    if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {

    auto *assignment = this->val_assignment(local_idx);
    auto ap = AssignmentPartRef{assignment, part};
    if (ap.modified()) {

  return AsmReg::make_invalid();
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(Jump jmp) {
  switch (jmp) {
  case Jump::ja: return Jump::jbe;
  case Jump::jae: return Jump::jb;
  case Jump::jb: return Jump::jae;
  case Jump::jbe: return Jump::ja;
  case Jump::je: return Jump::jne;
  case Jump::jg: return Jump::jle;
  case Jump::jge: return Jump::jl;
  case Jump::jl: return Jump::jge;
  case Jump::jle: return Jump::jg;
  case Jump::jne: return Jump::je;
  case Jump::jno: return Jump::jo;
  case Jump::jo: return Jump::jno;
  case Jump::js: return Jump::jns;
  case Jump::jns: return Jump::js;
  case Jump::jp: return Jump::jnp;
  case Jump::jnp: return Jump::jp;
  default: TPDE_UNREACHABLE("invalid jump kind for invert_jump");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(Jump jmp) {
  switch (jmp) {
  case Jump::ja: return Jump::jb;
  case Jump::jae: return Jump::jbe;
  case Jump::jb: return Jump::ja;
  case Jump::jbe: return Jump::jae;
  case Jump::je: return Jump::je;
  case Jump::jne: return Jump::jne;
  case Jump::jg: return Jump::jl;
  case Jump::jge: return Jump::jle;
  case Jump::jl: return Jump::jg;
  case Jump::jle: return Jump::jge;
  default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
FeCond CompilerX64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(Jump jmp) {
  FeCond res = FeCond(u32(jmp) << 16);
  switch (jmp) {
  case Jump::ja: assert(res == FE_CC_A && "FeCond value mismatch?"); break;
  case Jump::jae: assert(res == FE_CC_AE && "FeCond value mismatch?"); break;
  case Jump::jb: assert(res == FE_CC_B && "FeCond value mismatch?"); break;
  case Jump::jbe: assert(res == FE_CC_BE && "FeCond value mismatch?"); break;
  case Jump::je: assert(res == FE_CC_E && "FeCond value mismatch?"); break;
  case Jump::jg: assert(res == FE_CC_G && "FeCond value mismatch?"); break;
  case Jump::jge: assert(res == FE_CC_GE && "FeCond value mismatch?"); break;
  case Jump::jl: assert(res == FE_CC_L && "FeCond value mismatch?"); break;
  case Jump::jle: assert(res == FE_CC_LE && "FeCond value mismatch?"); break;
  case Jump::jne: assert(res == FE_CC_NE && "FeCond value mismatch?"); break;
  case Jump::jno: assert(res == FE_CC_NO && "FeCond value mismatch?"); break;
  case Jump::jo: assert(res == FE_CC_O && "FeCond value mismatch?"); break;
  case Jump::js: assert(res == FE_CC_S && "FeCond value mismatch?"); break;
  case Jump::jns: assert(res == FE_CC_NS && "FeCond value mismatch?"); break;
  case Jump::jp: assert(res == FE_CC_P && "FeCond value mismatch?"); break;
  case Jump::jnp: assert(res == FE_CC_NP && "FeCond value mismatch?"); break;
  default: TPDE_UNREACHABLE("invalid conditional jump");
  }
  return res;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
    Jump jmp, Label target_label) {
  this->text_writer.ensure_space(6);
  bool pending = this->text_writer.label_is_pending(target_label);
  void *target = this->text_writer.cur_ptr();
  if (!pending) {
    target = this->text_writer.begin_ptr() +
             this->text_writer.label_offset(target_label);
  }

    ASMNCF(JMP, pending ? FE_JMPL : 0, target);
    ASMNCF(Jcc, (pending ? FE_JMPL : 0) | jump_to_cond(jmp), target);

    this->text_writer.label_ref(target_label,
                                this->text_writer.offset() - 4,
                                LabelFixupKind::X64_JMP_OR_MEM_DISP);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
    Jump cc, AsmReg dst, bool zext) {
    ASM(MOV32ri, dst, 0);
  ASMF(SETcc8r, jump_to_cond(cc), dst);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
    Jump cc, AsmReg dst) {
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
    Jump cc, AsmReg dst, AsmReg src, bool is_64) {
  if (is_64) {
    ASMF(CMOVcc64rr, jump_to_cond(cc), dst, src);
  } else {
    ASMF(CMOVcc32rr, jump_to_cond(cc), dst, src);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) {
  assert(from < to && to <= 64);
  assert(may_clobber_flags());

    case 8: ASM(MOVZXr32r8, dst, src); break;
    case 16: ASM(MOVZXr32r16, dst, src); break;
    case 32: ASM(MOV32rr, dst, src); break;

      ASM(MOV32rr, dst, src);
      ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
    } else if (dst != src) {
      ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
      ASM(AND64rr, dst, src);
    } else {
      ScratchReg tmp{this};
      AsmReg tmp_reg = tmp.alloc_gp();
      ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
      ASM(AND64rr, dst, tmp_reg);
    }
  } else if (to <= 32) {
    case 8: ASM(MOVSXr32r8, dst, src); break;
    case 16: ASM(MOVSXr32r16, dst, src); break;

      ASM(MOV32rr, dst, src);
      ASM(AND32ri, dst, 1);

      ASM(SHL32ri, dst, 32 - from);
      ASM(SAR32ri, dst, 32 - from);

    case 8: ASM(MOVSXr64r8, dst, src); break;
    case 16: ASM(MOVSXr64r16, dst, src); break;
    case 32: ASM(MOVSXr64r32, dst, src); break;

      ASM(MOV32rr, dst, src);
      ASM(MOV64rr, dst, src);

      ASM(AND32ri, dst, 1);

      ASM(SHL64ri, dst, 64 - from);
      ASM(SAR64ri, dst, 64 - from);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfi(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) {
  assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
  assert(may_clobber_flags());
  ScratchReg tmp1{this};
  AsmReg tmp1_reg = tmp1.alloc_gp();

    ASM(BTR64ri, dst, lsb);
  } else if (lsb + width <= 31) {
    ASM(AND64ri, dst, ~(((u64{1} << width) - 1) << lsb));
  } else {
    ASM(MOV64ri, tmp1_reg, ~(((u64{1} << width) - 1) << lsb));
    ASM(AND64rr, dst, tmp1_reg);
  }

    ASM(MOVZXr32r8, tmp1_reg, src);
  } else if (width == 16) {
    ASM(MOVZXr32r16, tmp1_reg, src);
  } else if (width <= 32) {
    ASM(MOV32rr, tmp1_reg, src);

    ASM(AND32ri, tmp1_reg, (u32{1} << width) - 1);
  } else {
    ASM(MOV64ri, tmp1_reg, (u64{1} << width) - 1);
    ASM(AND64rr, tmp1_reg, src);
  }

  if (lsb >= 1 && lsb <= 3) {
    ASM(LEA64rm, dst, FE_MEM(dst, u8(1 << lsb), tmp1_reg, 0));
  } else {
    if (lsb > 0 && lsb + width <= 32) {
      ASM(SHL32ri, tmp1_reg, lsb);
    } else if (lsb > 0) {
      ASM(SHL64ri, tmp1_reg, lsb);
    }
    ASM(OR64rr, dst, tmp1_reg);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfiz(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) {
  assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
  assert(may_clobber_flags());

    ASM(MOVZXr32r8, dst, src);
  } else if (width == 16) {
    ASM(MOVZXr32r16, dst, src);
  } else if (width <= 32) {
    ASM(MOV32rr, dst, src);

    ASM(AND32ri, dst, (u32{1} << width) - 1);
  } else {
    ASM(MOV64ri, dst, (u64{1} << width) - 1);
    ASM(AND64rr, dst, src);
  }

  if (lsb > 0 && lsb + width <= 32) {
    ASM(SHL32ri, dst, lsb);
  } else if (lsb > 0) {
    ASM(SHL64ri, dst, lsb);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::set_stack_used() {
  if (this->compiler.stack.has_dynamic_alloca && stack_adjust_off == 0) {
    stack_adjust_off = this->compiler.text_writer.offset();
    // Placeholder adjustment; call_impl patches the real stack size into the
    // imm32 of this instruction below.
    ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
    assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
    ValuePart &vp, CCAssignment &cca) {
  AsmReg ptr = vp.load_to_reg(&this->compiler);
  ScratchReg scratch{&this->compiler};
  AsmReg tmp = scratch.alloc_gp();

  auto size = cca.size;
  i32 soff = cca.stack_off;

    ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV64mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);

    ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV32mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);

    ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV16mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);

    ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler, MOV8mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
    ValuePart &vp, CCAssignment &cca) {
  auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
  FeMem mem_op = FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off));
  if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
    switch (cca.size) {
    case 1: ASMC(&this->compiler, MOV8mr, mem_op, reg); break;
    case 2: ASMC(&this->compiler, MOV16mr, mem_op, reg); break;
    case 4: ASMC(&this->compiler, MOV32mr, mem_op, reg); break;
    case 8: ASMC(&this->compiler, MOV64mr, mem_op, reg); break;
    default: TPDE_UNREACHABLE("invalid GP reg size");
    }
  } else {
    assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
    switch (cca.size) {
    case 4: ASMC(&this->compiler, SSE_MOVSSmr, mem_op, reg); break;
    case 8: ASMC(&this->compiler, SSE_MOVSDmr, mem_op, reg); break;
    case 16: ASMC(&this->compiler, SSE_MOVDQAmr, mem_op, reg); break;
    default: TPDE_UNREACHABLE("invalid SSE reg size");
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
    std::variant<SymRef, ValuePart> &&target) {
  if (this->assigner.is_vararg()) {
    if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
      this->compiler.evict_reg(Reg{AsmReg::AX});
    }
    Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
        Config::FP_BANK, 0);
    unsigned xmm_cnt = 8;
    if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
      xmm_cnt = next_xmm.id() - AsmReg::XMM0;
    }
    if (xmm_cnt != 0) {
      ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
    } else {
      ASMC(&this->compiler, XOR32rr, FE_AX, FE_AX);
    }
  }

  u32 sub = 0;
  if (stack_adjust_off != 0) {
    auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
    sub = util::align_up(this->assigner.get_stack_size(), 0x10);
    memcpy(inst_ptr + 3, &sub, sizeof(u32));
  } else {
    auto &max_stack_size = this->compiler.max_callee_stack_arg_size;
    max_stack_size = std::max(max_stack_size, this->assigner.get_stack_size());
  }

  if (auto *sym = std::get_if<SymRef>(&target)) {
    this->compiler.text_writer.ensure_space(16);
    ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
    this->compiler.reloc_text(
        *sym, elf::R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
  } else {
    ValuePart &tvp = std::get<ValuePart>(target);
    if (tvp.has_assignment() && !tvp.assignment().register_valid()) {
      assert(tvp.assignment().stack_valid());
      auto off = tvp.assignment().frame_off();
      ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
    } else if (tvp.can_salvage()) {
      ASMC(&this->compiler, CALLr, tvp.salvage(&this->compiler));
    } else {
      assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
      AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
      ASMC(&this->compiler, CALLr, reg);
    }
    tvp.reset(&this->compiler);
  }

  if (stack_adjust_off != 0) {
    ASMC(&this->compiler, ADD64ri, FE_SP, sub);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
    std::variant<SymRef, ValuePart> &&target,
    std::span<CallArg> arguments,
    typename Base::ValueRef *result,
    const bool variable_args) {
  for (auto &arg : arguments) {
    cb.add_arg(std::move(arg));
  }
  cb.call(std::move(target));
    cb.add_ret(*result);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) {
  if (width_is_32) {
    ASM(CMP32ri, cmp_reg, case_value);
  } else {
    if (i64(i32(case_value)) == i64(case_value)) {
      ASM(CMP64ri, cmp_reg, case_value);
    } else {
      ASM(CMP64rr, cmp_reg, tmp_reg);
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
    const Label case_label,
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) {
  switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
FunctionWriterBase::JumpTable *
    CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_create_jump_table(
        Label default_label,
  bool needs_ext = width_is_32;
  if (low_bound > 0) {
      ASM(SUB32ri, cmp_reg, i32(low_bound));
    } else if (i32(low_bound) == i64(low_bound)) {
      ASM(SUB64ri, cmp_reg, low_bound);
    } else {
      ASM(SUB64rr, cmp_reg, tmp_reg);
    }
  }

  switch_emit_cmp(cmp_reg, tmp_reg, high_bound - low_bound, width_is_32);

    ASM(MOV32rr, cmp_reg, cmp_reg);

  u64 range = high_bound - low_bound + 1;
  return &this->text_writer.create_jump_table(range, cmp_reg, tmp_reg);
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
    const Label case_label,
    const Label gt_label,
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) {
  switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
        SymRef sym, TLSModel model) {
  switch (model) {
  case TLSModel::GlobalDynamic: {
    assert(!this->stack.is_leaf_function);
    assert(may_clobber_flags());
    this->stack.generated_call = true;
    auto csr = CCAssignerSysV::Info.callee_saved_regs;
    for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {

    ScratchReg arg{this};
    AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);

    this->text_writer.ensure_space(0x10);
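    // The 0x66 prefixes and the 0x48 byte below pad the sequence to the exact
    // byte pattern the ABI prescribes for the general-dynamic TLS access
    // model, so the linker can recognize and relax it (e.g. to initial-exec
    // or local-exec) when possible.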
    *this->text_writer.cur_ptr()++ = 0x66;
    ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
    this->reloc_text(
        sym, elf::R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x48;
    ASMNC(CALL, this->text_writer.cur_ptr());
    if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
      this->sym_tls_get_addr = this->assembler.sym_add_undef(
    this->reloc_text(this->sym_tls_get_addr,
                     elf::R_X86_64_PLT32,
                     this->text_writer.offset() - 4,

    ScratchReg res{this};
    res.alloc_specific(AsmReg::AX);