#include "tpde/AssemblerElf.hpp"
#include "tpde/AssignmentPartRef.hpp"
#include "tpde/CompilerBase.hpp"
#include "tpde/base.hpp"
#include "tpde/x64/FunctionWriterX64.hpp"
#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
  #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
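// Encoding helpers built on the fadec (fe64_*) encoder. ASM/ASMC reserve 16
// bytes in the text section before encoding, the *NC variants assume space has
// already been ensured, and the *F variants pass extra encoder flags.
// Example: ASM(MOV64rr, FE_BP, FE_SP) encodes "mov rbp, rsp" at the current
// write position of the active compiler ("this").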
#define ASM_FULL(compiler, reserve, op, ...)                                   \
  ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))

#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMC(compiler, op, ...)                                                \
  ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
#define ASMF(op, flag, ...)                                                    \
  ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNCF(op, flag, ...)                                                  \
  ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
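// AsmReg is a thin wrapper around Reg that adds implicit conversions to the
// fadec operand types (FeRegGP, FeRegGPLH, FeRegXMM). Ids 0..R15 form the
// general-purpose bank, XMM0..XMM15 the vector bank.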
  constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}

  constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}

  constexpr AsmReg(const Reg base) noexcept : Reg(base) {}

  constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
    assert(id <= R15 || (id >= XMM0 && id <= XMM15));
  }

  constexpr operator FeRegGP() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  operator FeRegGPLH() const noexcept {
    assert(reg_id <= R15);
    return FeRegGP{reg_id};
  }

  constexpr operator FeRegXMM() const noexcept {
    assert(reg_id >= XMM0 && reg_id <= XMM15);
    return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
  }
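// Helpers to build the register bitmasks used by the calling-convention
// descriptions below.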
constexpr static u64
    create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg;
  }
  return set;
}

template <size_t N>
constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
  u64 set = 0;
  for (const auto reg : regs) {
    set |= 1ull << reg.id();
  }
  return set;
}
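// Calling-convention assigner for the x86-64 System V ABI: up to six
// general-purpose and eight XMM argument registers, remaining arguments go on
// the stack with at least 8-byte alignment. Return values use AX/DX and
// XMM0/XMM1.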
class CCAssignerSysV : public CCAssigner {
  static constexpr CCInfo Info{
      .allocatable_regs =
          0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
      .callee_saved_regs = create_bitmask(
          {AsmReg::BX, AsmReg::R12, AsmReg::R13, AsmReg::R14, AsmReg::R15}),
      .arg_regs = create_bitmask(
          {AsmReg::DI, AsmReg::SI, AsmReg::DX, AsmReg::CX, AsmReg::R8,
           AsmReg::R9, AsmReg::XMM0, AsmReg::XMM1, AsmReg::XMM2, AsmReg::XMM3,
           AsmReg::XMM4, AsmReg::XMM5, AsmReg::XMM6, AsmReg::XMM7}),
      .red_zone_size = 128,
  };
  u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
  unsigned must_assign_stack = 0;
  u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;

  CCAssignerSysV(bool vararg = false) noexcept
      : CCAssigner(Info), vararg(vararg) {}

  void reset() noexcept override {
    gp_cnt = xmm_cnt = stack = 0;
    must_assign_stack = 0;
    ret_gp_cnt = ret_xmm_cnt = 0;
  }
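  // Assign one argument part. arg.consecutive > 0 requests that the following
  // parts receive consecutive registers; if that is impossible,
  // must_assign_stack forces the whole group onto the stack.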
  void assign_arg(CCAssignment &arg) noexcept override {
    if (arg.byval) {
      stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
      arg.stack_off = stack;
      stack += arg.size;
      return;
    }

    if (arg.bank == RegBank{0}) {
      static constexpr std::array<AsmReg, 6> gp_arg_regs{AsmReg::DI,
          AsmReg::SI, AsmReg::DX, AsmReg::CX, AsmReg::R8, AsmReg::R9};
      if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
        arg.reg = gp_arg_regs[gp_cnt];
        gp_cnt += 1;
      } else {
        must_assign_stack = arg.consecutive + 1;
        stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
        arg.stack_off = stack;
        stack += 8;
      }
    } else {
      if (!must_assign_stack && xmm_cnt < 8) {
        arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
        xmm_cnt += 1;
      } else {
        must_assign_stack = arg.consecutive + 1;
        u32 size = util::align_up(arg.size, 8);
        stack = util::align_up(stack, size);
        arg.stack_off = stack;
        stack += size;
      }
    }

    if (must_assign_stack > 0) {
      must_assign_stack -= 1;
    }
  }
  u32 get_stack_size() noexcept override { return stack; }

  bool is_vararg() const noexcept override { return vararg; }

  void assign_ret(CCAssignment &arg) noexcept override {
    assert(!arg.byval && !arg.sret);
    if (arg.bank == RegBank{0}) {
      if (ret_gp_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
        ret_gp_cnt += 1;
      }
    } else {
      if (ret_xmm_cnt + arg.consecutive < 2) {
        arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
        ret_xmm_cnt += 1;
      }
    }
  }
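// Architecture/platform description used by the generic compiler: ELF
// assembler, x86-64 register file with two banks (GP and XMM), SysV as the
// default calling convention, and negative frame offsets relative to RBP.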
struct PlatformConfig : CompilerConfigDefault {
  using Assembler = AssemblerElfX64;
  using AsmReg = tpde::x64::AsmReg;
  using DefaultCCAssigner = CCAssignerSysV;
  using FunctionWriter = FunctionWriterX64;

  static constexpr RegBank GP_BANK{0};
  static constexpr RegBank FP_BANK{1};
  static constexpr bool FRAME_INDEXING_NEGATIVE = true;
  static constexpr u32 PLATFORM_POINTER_SIZE = 8;
  static constexpr u32 NUM_BANKS = 2;
};
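// Concept the derived compiler must satisfy in addition to tpde::Compiler:
// x86-64-specific queries used by the argument lowering.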
template <typename T, typename Config>
concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
  {
    a.arg_is_int128(std::declval<typename T::IRValueRef>())
  } -> std::convertible_to<bool>;

  {
    a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
  } -> std::convertible_to<bool>;
};
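// The x86-64 compiler. Derived is the user's final compiler class (CRTP),
// BaseTy the CompilerBase-style template it builds on, and Config the
// platform configuration above.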
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy =
              CompilerBase,
          typename Config = PlatformConfig>
struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
  using Base = BaseTy<Adaptor, Derived, Config>;

  using IRValueRef = typename Base::IRValueRef;
  using IRBlockRef = typename Base::IRBlockRef;
  using IRFuncRef = typename Base::IRFuncRef;

  using ScratchReg = typename Base::ScratchReg;
  using ValuePartRef = typename Base::ValuePartRef;
  using ValuePart = typename Base::ValuePart;
  using GenericValuePart = typename Base::GenericValuePart;

  using Assembler = typename PlatformConfig::Assembler;
  using RegisterFile = typename Base::RegisterFile;

  using CallArg = typename Base::CallArg;
  static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
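  // CPU feature flags; CPU_V2/V3/V4 correspond to the x86-64-v2/v3/v4
  // microarchitecture levels.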
  enum CPU_FEATURES : u32 {
    CPU_BASELINE = 0,
    CPU_CMPXCHG16B = (1 << 0),
    CPU_POPCNT = (1 << 1),
    CPU_SSE3 = (1 << 2),
    CPU_SSSE3 = (1 << 3),
    CPU_SSE4_1 = (1 << 4),
    CPU_SSE4_2 = (1 << 5),
    CPU_AVX = (1 << 6),
    CPU_AVX2 = (1 << 7),
    CPU_BMI1 = (1 << 8),
    CPU_BMI2 = (1 << 9),
    CPU_F16C = (1 << 10),
    CPU_FMA = (1 << 11),
    CPU_LZCNT = (1 << 12),
    CPU_MOVBE = (1 << 13),
    CPU_AVX512F = (1 << 14),
    CPU_AVX512BW = (1 << 15),
    CPU_AVX512CD = (1 << 16),
    CPU_AVX512DQ = (1 << 17),
    CPU_AVX512VL = (1 << 18),

    CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
             CPU_SSE4_1 | CPU_SSE4_2,
    CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
             CPU_FMA | CPU_LZCNT | CPU_MOVBE,
    CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
             CPU_AVX512VL,
  };

  CPU_FEATURES cpu_feats = CPU_BASELINE;
  u64 fixed_assignment_nonallocatable_mask =
      create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
  u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
      func_reg_restore_alloc = 0u;

  u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
  u32 reg_save_frame_off = 0;
  u32 var_arg_stack_off = 0;
  util::SmallVector<u32, 8> func_ret_offs = {};

  u32 max_callee_stack_arg_size;

  SymRef sym_tls_get_addr;
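  // Call-frame builder for the SysV convention; patches the stack-pointer
  // adjustment at the call site once the final argument area size is known.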
  class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
    u32 stack_adjust_off = 0;

    void set_stack_used() noexcept;

  public:
    CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
        : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}

    void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
    void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
    void call_impl(std::variant<SymRef, ValuePart> &&target) noexcept;
    void reset_stack() noexcept;
  };
  explicit CompilerX64(Adaptor *adaptor,
                       const CPU_FEATURES cpu_features = CPU_BASELINE)
      : Base{adaptor}, cpu_feats(cpu_features) {
    static_assert(std::is_base_of_v<CompilerX64, Derived>);
    static_assert(concepts::Compiler<Derived, PlatformConfig>);
  }
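  // Wraps a fe64_* encoder function so the ASM* macros can reserve space in
  // the text section, encode at the current write pointer, and advance it.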
  template <typename... Args>
  auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
    struct Helper {
      CompilerX64 *compiler;
      unsigned (*fn)(u8 *, int, Args...);

      void encode(unsigned reserve, int flags, Args... args) {
        compiler->text_writer.ensure_space(reserve);
        unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
        compiler->text_writer.cur_ptr() += n;
      }
    };
    return Helper{this, enc_fn};
  }
  void start_func(u32 func_idx) noexcept;

  void gen_func_prolog_and_args(CCAssigner *) noexcept;

  void finish_func(u32 func_idx) noexcept;

  void reset() noexcept;

  void gen_func_epilog() noexcept;

  void set_preserve_flags(bool preserve) noexcept { preserve_flags = preserve; }
  bool may_clobber_flags() noexcept { return !preserve_flags; }
  void spill_reg(const AsmReg reg,
                 const i32 frame_off,
                 const u32 size) noexcept;

  void load_from_stack(AsmReg dst,
                       i32 frame_off,
                       u32 size,
                       bool sign_extend = false) noexcept;

  void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;

  void mov(AsmReg dst, AsmReg src, u32 size) noexcept;

  GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
    assert(ap.stack_valid() && !ap.variable_ref());
    return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
  }

  AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;

  void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;

  void alloca_dynamic(u64 elem_size,
                      ValuePart &&count,
                      u32 align,
                      ValuePart &res) noexcept;

  void materialize_constant(const u64 *data,
                            RegBank bank,
                            u32 size,
                            AsmReg dst) noexcept;

  AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;
  Jump invert_jump(Jump jmp) noexcept;
  Jump swap_jump(Jump jmp) noexcept;

  FeCond jump_to_cond(Jump jmp) noexcept;

  void generate_branch_to_block(Jump jmp,
                                IRBlockRef target,
                                bool needs_split,
                                bool last_inst) noexcept;

  void generate_raw_jump(Jump jmp, Label target) noexcept;

  void generate_raw_set(Jump cc, AsmReg dst, bool zext = true) noexcept;

  void generate_raw_mask(Jump cc, AsmReg dst) noexcept;

  void generate_raw_cmov(Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept;

  void generate_raw_intext(
      AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;

  void generate_raw_bfi(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept;

  void generate_raw_bfiz(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept;

  void generate_call(std::variant<SymRef, ValuePart> &&target,
                     std::span<CallArg> arguments,
                     typename Base::ValueRef *result,
                     bool variable_args = false);

  void switch_emit_cmp(AsmReg cmp_reg,
                       AsmReg tmp_reg,
                       u64 case_value,
                       bool width_is_32) noexcept;

  void switch_emit_cmpeq(Label case_label,
                         AsmReg cmp_reg,
                         AsmReg tmp_reg,
                         u64 case_value,
                         bool width_is_32) noexcept;

  bool switch_emit_jump_table(Label default_label,
                              std::span<const Label> labels,
                              AsmReg cmp_reg,
                              AsmReg tmp_reg,
                              u64 low_bound,
                              u64 high_bound,
                              bool width_is_32) noexcept;

  void switch_emit_binary_step(Label case_label,
                               Label gt_label,
                               AsmReg cmp_reg,
                               AsmReg tmp_reg,
                               u64 case_value,
                               bool width_is_32) noexcept;

  ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;

  bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
    return ((cpu_feats & feats) == feats);
  }
};
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
    const u32) noexcept {
  this->text_writer.align(16);
  this->assembler.except_begin_func();
  this->preserve_flags = false;
}
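// Emits push rbp; mov rbp, rsp and reserves space for the callee-saved
// register saves and the frame allocation, which are patched in finish_func
// once the clobbered registers and final frame size are known. Also spills
// the register save area for vararg functions and assigns incoming arguments.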
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
    CCAssigner *cc_assigner) noexcept {
  func_ret_offs.clear();
  func_start_off = this->text_writer.offset();
  scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;

  const CCInfo &cc_info = cc_assigner->get_ccinfo();

  ASM(PUSHr, FE_BP);
  ASM(MOV64rr, FE_BP, FE_SP);

  func_reg_save_off = this->text_writer.offset();

  auto csr = cc_info.callee_saved_regs;
  assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
         "non-gp callee-saved registers not implemented");

  u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
  u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
  u32 reg_save_size = 1 * csr_logp + 2 * csr_higp + 7;
  this->stack.frame_size = 8 * (csr_logp + csr_higp);
  max_callee_stack_arg_size = 0;

  this->text_writer.ensure_space(reg_save_size);
  this->text_writer.cur_ptr() += reg_save_size;
  func_reg_save_alloc = reg_save_size;
  func_reg_restore_alloc = reg_save_size;

  if (this->adaptor->cur_is_vararg()) {
    this->stack.frame_size += 6 * 8 + 8 * 16;
    reg_save_frame_off = this->stack.frame_size;
    auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
    ASM(MOV64mr, mem, FE_DI);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 8);
    ASM(MOV64mr, mem, FE_SI);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 16);
    ASM(MOV64mr, mem, FE_DX);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 24);
    ASM(MOV64mr, mem, FE_CX);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 32);
    ASM(MOV64mr, mem, FE_R8);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 40);
    ASM(MOV64mr, mem, FE_R9);
    auto skip_fp = this->text_writer.label_create();
    ASM(TEST8rr, FE_AX, FE_AX);
    generate_raw_jump(Jump::je, skip_fp);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 48);
    ASM(SSE_MOVDQUmr, mem, FE_XMM0);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 64);
    ASM(SSE_MOVDQUmr, mem, FE_XMM1);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 80);
    ASM(SSE_MOVDQUmr, mem, FE_XMM2);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 96);
    ASM(SSE_MOVDQUmr, mem, FE_XMM3);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 112);
    ASM(SSE_MOVDQUmr, mem, FE_XMM4);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 128);
    ASM(SSE_MOVDQUmr, mem, FE_XMM5);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 144);
    ASM(SSE_MOVDQUmr, mem, FE_XMM6);
    mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off + 160);
    ASM(SSE_MOVDQUmr, mem, FE_XMM7);
    this->label_place(skip_fp);
  }

  assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
         "argument registers must also be allocatable");
  this->register_file.allocatable &= ~cc_info.arg_regs;
  for (const IRValueRef arg : this->adaptor->cur_args()) {
      [&](ValuePart &&vp, CCAssignment cca) -> std::optional<i32> {
        cca.bank = vp.bank();
        cca.size = vp.part_size();

        cc_assigner->assign_arg(cca);

        if (cca.reg.valid()) [[likely]] {
          vp.set_value_reg(this, cca.reg);

          this->register_file.allocatable |= u64{1} << cca.reg.id();

          return 0x10 + cca.stack_off;

          AsmReg dst = vp.alloc_reg(this);
          this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);

  if (this->adaptor->cur_is_vararg()) [[unlikely]] {
    auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
    u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
    u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
    this->scalar_arg_count = std::popcount(gp_regs);
    this->vec_arg_count = std::popcount(xmm_regs);
    this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
  }

  this->register_file.allocatable |= cc_info.arg_regs;
}
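// Finalizes a function: writes the callee-saved register pushes and the frame
// allocation into the space reserved by gen_func_prolog_and_args, emits the
// CFI for the prologue, and copies the epilogue (stack restore, pops, ret) to
// every return site recorded in func_ret_offs.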
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
    u32 func_idx) noexcept {
  auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
  this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
  this->assembler.eh_write_inst(
      dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
  this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
                                dwarf::x64::DW_reg_rbp);

  auto fde_prologue_adv_off = this->assembler.eh_writer.size();
  this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
  auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
  const CCInfo &ccinfo = derived()->cur_cc_assigner()->get_ccinfo();
  auto csr = ccinfo.callee_saved_regs;
  u64 saved_regs = this->register_file.clobbered & csr;
  u32 num_saved_regs = 0u;
  for (auto reg : util::BitSetIterator{saved_regs}) {
    assert(reg <= AsmReg::R15);
    write_ptr +=
        fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
    static const u8 gpreg_to_dwarf[] = {
        dwarf::x64::DW_reg_rax,
        dwarf::x64::DW_reg_rcx,
        dwarf::x64::DW_reg_rdx,
        dwarf::x64::DW_reg_rbx,
        dwarf::x64::DW_reg_rsp,
        dwarf::x64::DW_reg_rbp,
        dwarf::x64::DW_reg_rsi,
        dwarf::x64::DW_reg_rdi,
        dwarf::x64::DW_reg_r8,
        dwarf::x64::DW_reg_r9,
        dwarf::x64::DW_reg_r10,
        dwarf::x64::DW_reg_r11,
        dwarf::x64::DW_reg_r12,
        dwarf::x64::DW_reg_r13,
        dwarf::x64::DW_reg_r14,
        dwarf::x64::DW_reg_r15,
    };
    u8 dwarf_reg = gpreg_to_dwarf[reg];
    auto cfa_off = num_saved_regs + 2;
    this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
    num_saved_regs += 1;
  }
  assert((!this->stack.has_dynamic_alloca || max_callee_stack_arg_size == 0) &&
         "stack with dynamic alloca must adjust stack pointer at call sites");
  u32 final_frame_size =
      util::align_up(this->stack.frame_size + max_callee_stack_arg_size, 16);
  u32 rsp_adjustment = final_frame_size - num_saved_regs * 8;
  bool needs_rsp_adjustment = this->stack.generated_call ||
                              this->stack.has_dynamic_alloca ||
                              rsp_adjustment > ccinfo.red_zone_size;
  if (needs_rsp_adjustment) {
    write_ptr += fe64_SUB64ri(write_ptr, 0, FE_SP, rsp_adjustment);
  }

  u32 prologue_size =
      write_ptr - (this->text_writer.begin_ptr() + func_start_off);
  assert(prologue_size < 0x44);
  this->assembler.eh_writer.data()[fde_prologue_adv_off] =
      dwarf::DW_CFA_advance_loc | (prologue_size - 4);

  const auto reg_save_end =
      this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
  assert(reg_save_end >= write_ptr);
  const u32 nop_len = reg_save_end - write_ptr;
  fe64_NOP(write_ptr, nop_len);
  auto func_sym = this->func_syms[func_idx];
  auto func_sec = this->text_writer.get_sec_ref();
  if (func_ret_offs.empty()) {
    auto func_size = this->text_writer.offset() - func_start_off;
    this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
    this->assembler.eh_end_fde(fde_off, func_sym);
    this->assembler.except_encode_func(func_sym,
                                       this->text_writer.label_offsets.data());
    return;
  }
  auto *text_data = this->text_writer.begin_ptr();
  u32 first_ret_off = func_ret_offs[0];
  u32 ret_size = 0;
  u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc;
  u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;

  write_ptr = text_data + first_ret_off;
  const auto ret_start = write_ptr;
  if (this->stack.has_dynamic_alloca) {
    if (num_saved_regs == 0) {
      write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
    } else {
      write_ptr +=
          fe64_LEA64rm(write_ptr,
                       0,
                       FE_SP,
                       FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
    }
  } else if (needs_rsp_adjustment) {
    write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, rsp_adjustment);
  }
  for (auto reg : util::BitSetIterator<true>{saved_regs}) {
    assert(reg <= AsmReg::R15);
    write_ptr +=
        fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
  }
  write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
  write_ptr += fe64_RET(write_ptr, 0);
  ret_size = write_ptr - ret_start;
  assert(ret_size <= epilogue_size && "function epilogue too long");

  if (epilogue_size > ret_size) {
    fe64_NOP(write_ptr, epilogue_size - ret_size);
    if (first_ret_off == func_end_ret_off) {
      this->text_writer.cur_ptr() -= epilogue_size - ret_size;
    }
  }

  for (u32 i = 1; i < func_ret_offs.size(); ++i) {
    memcpy(
        text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
    if (func_ret_offs[i] == func_end_ret_off) {
      this->text_writer.cur_ptr() -= epilogue_size - ret_size;
    }
  }

  auto func_size = this->text_writer.offset() - func_start_off;
  this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
  this->assembler.eh_end_fde(fde_off, func_sym);
  this->assembler.except_encode_func(func_sym,
                                     this->text_writer.label_offsets.data());
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
  func_ret_offs.clear();
  sym_tls_get_addr = {};
}
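// The epilogue itself is written by finish_func; here we only record the
// return site and reserve space for it (7 bytes stack restore + 1 byte pop
// rbp + 1 byte ret + the register pops).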
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
  func_ret_offs.push_back(this->text_writer.offset());

  const u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc;

  this->text_writer.ensure_space(epilogue_size);
  this->text_writer.cur_ptr() += epilogue_size;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
    const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
  this->text_writer.ensure_space(16);
  assert(frame_off < 0);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
  if (reg.id() <= AsmReg::R15) {
    switch (size) {
    case 1: ASMNC(MOV8mr, mem, reg); break;
    case 2: ASMNC(MOV16mr, mem, reg); break;
    case 4: ASMNC(MOV32mr, mem, reg); break;
    case 8: ASMNC(MOV64mr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  } else {
    switch (size) {
    case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
    case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
    case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
    default: TPDE_UNREACHABLE("invalid spill size");
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
    const AsmReg dst,
    const i32 frame_off,
    const u32 size,
    const bool sign_extend) noexcept {
  this->text_writer.ensure_space(16);
  const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);

  if (dst.id() <= AsmReg::R15) {
    if (!sign_extend) {
      switch (size) {
      case 1: ASMNC(MOVZXr32m8, dst, mem); break;
      case 2: ASMNC(MOVZXr32m16, dst, mem); break;
      case 4: ASMNC(MOV32rm, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    } else {
      switch (size) {
      case 1: ASMNC(MOVSXr64m8, dst, mem); break;
      case 2: ASMNC(MOVSXr64m16, dst, mem); break;
      case 4: ASMNC(MOVSXr64m32, dst, mem); break;
      case 8: ASMNC(MOV64rm, dst, mem); break;
      default: TPDE_UNREACHABLE("invalid spill size");
      }
    }
    return;
  }

  assert(!sign_extend);
  switch (size) {
  case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
  case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
  case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
  default: TPDE_UNREACHABLE("invalid spill size");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
    const AsmReg dst, const AssignmentPartRef ap) noexcept {
  ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
    const AsmReg dst, const AsmReg src, const u32 size) noexcept {
  this->text_writer.ensure_space(16);
  assert(dst.valid());
  assert(src.valid());
  if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
    if (size > 4) {
      ASMNC(MOV64rr, dst, src);
    } else {
      ASMNC(MOV32rr, dst, src);
    }
  } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
    if (size <= 16) {
      if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
        assert(has_cpu_feats(CPU_AVX512F));
        ASMNC(VMOVAPD128rr, dst, src);
      } else {
        ASMNC(SSE_MOVAPDrr, dst, src);
      }
    } else if (size <= 32) {
      assert(has_cpu_feats(CPU_AVX));
      assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
             has_cpu_feats(CPU_AVX512F));
      ASMNC(VMOVAPD256rr, dst, src);
    } else {
      assert(has_cpu_feats(CPU_AVX512F));
      ASMNC(VMOVAPD512rr, dst, src);
    }
  } else if (dst.id() <= AsmReg::R15) {
    assert(src.id() >= AsmReg::XMM0);
    if (src.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASMNC(VMOVD_X2Grr, dst, src);
      } else {
        ASMNC(VMOVQ_X2Grr, dst, src);
      }
    } else if (size <= 4) {
      ASMNC(SSE_MOVD_X2Grr, dst, src);
    } else {
      ASMNC(SSE_MOVQ_X2Grr, dst, src);
    }
  } else {
    assert(src.id() <= AsmReg::R15);
    assert(dst.id() >= AsmReg::XMM0);
    if (dst.id() > AsmReg::XMM15) {
      assert(has_cpu_feats(CPU_AVX512F));
      if (size <= 4) {
        ASMNC(VMOVD_G2Xrr, dst, src);
      } else {
        ASMNC(VMOVQ_G2Xrr, dst, src);
      }
    } else if (size <= 4) {
      ASMNC(SSE_MOVD_G2Xrr, dst, src);
    } else {
      ASMNC(SSE_MOVQ_G2Xrr, dst, src);
    }
  }
}
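// Materializes a GenericValuePart address expression (base + index * scale +
// displacement) into a single register, preferring a single LEA and falling
// back to shifts/IMUL for unsupported scales. Reuses a scratch register from
// the expression's operands when possible.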
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
    GenericValuePart &gv) noexcept {
  auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);

  ScratchReg scratch{derived()};
  bool disp32 = i32(expr.disp) == expr.disp;
  AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
  AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
  if (std::holds_alternative<ScratchReg>(expr.base)) {
    scratch = std::move(std::get<ScratchReg>(expr.base));
  } else if (std::holds_alternative<ScratchReg>(expr.index)) {
    scratch = std::move(std::get<ScratchReg>(expr.index));
  } else {
    (void)scratch.alloc_gp();
  }
  auto dst = scratch.cur_reg();

  if (idx.valid()) {
    if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
      const u8 sc = expr.scale;
      if (base.valid() && disp32) {
        ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
        expr.disp = 0;
      } else if (base.valid()) {
        ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
      } else if (disp32) {
        ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
        expr.disp = 0;
      } else {
        ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
      }
    } else {
      assert(may_clobber_flags());
      u64 scale = expr.scale;
      base = AsmReg::make_invalid();

      ScratchReg idx_scratch{derived()};
      AsmReg idx_tmp = dst;
      if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
        idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
      } else if (dst == base) {
        idx_tmp = idx_scratch.alloc_gp();
      }

      if ((scale & (scale - 1)) == 0) {
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        }
        ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
      } else if (i32(scale) == i64(scale)) {
        ASM(IMUL64rri, idx_tmp, idx, scale);
      } else {
        ScratchReg scratch2{derived()};
        auto tmp2 = scratch2.alloc_gp();
        ASM(MOV64ri, tmp2, scale);
        if (idx_tmp != idx) {
          ASM(MOV64rr, idx_tmp, idx);
        }
        ASM(IMUL64rr, idx_tmp, tmp2);
      }

      if (disp32 || (idx_tmp != dst && base != dst)) {
        ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
        expr.disp = 0;
      } else if (dst == base) {
        ASM(ADD64rr, dst, idx_tmp);
      } else {
        ASM(ADD64rr, dst, base);
      }
    }
  } else if (base.valid()) {
    if (expr.disp && disp32) {
      ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
      expr.disp = 0;
    } else if (dst != base) {
      ASM(MOV64rr, dst, base);
    }
  }

  if (expr.disp) {
    ScratchReg scratch2{derived()};
    auto tmp2 = scratch2.alloc_gp();
    ASM(MOV64ri, tmp2, expr.disp);
    if (may_clobber_flags()) {
      ASM(ADD64rr, dst, tmp2);
    } else {
      ASM(LEA64rm, dst, FE_MEM(dst, 1, tmp2, 0));
    }
  }

  gv.state = std::move(scratch);
  return dst;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
    u64 size, u32 align, ValuePart &res) noexcept {
  assert(this->stack.has_dynamic_alloca &&
         "function marked as not having dynamic allocas can't have alloca");
  assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
  assert(may_clobber_flags());
  size = tpde::util::align_up(size, 16);
  assert(size < 0x8000'0000);
  ASM(SUB64ri, FE_SP, size);

  if (align > 16) {
    assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
    ASM(AND64ri, FE_SP, ~(align - 1));
  }
  ASM(MOV64rr, res.alloc_reg(this), FE_SP);
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
    u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
  assert(this->stack.has_dynamic_alloca &&
         "function marked as not having dynamic allocas can't have alloca");
  assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
  assert(may_clobber_flags());
  AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
  AsmReg res_reg = res.alloc_try_reuse(this, count);

  if (elem_size == 0) {
    ASM(XOR32rr, res_reg, res_reg);
  } else if ((elem_size & (elem_size - 1)) == 0) {
    const auto shift = util::cnt_tz(elem_size);
    if (shift > 0 && shift < 4) {
      ASM(LEA64rm, res_reg, FE_MEM(FE_NOREG, u8(1 << shift), size_reg, 0));
    } else {
      if (size_reg != res_reg) {
        ASM(MOV64rr, res_reg, size_reg);
      }
      if (elem_size != 1) {
        ASM(SHL64ri, res_reg, shift);
      }
    }
  } else {
    if (elem_size <= 0x7FFF'FFFF) [[likely]] {
      ASM(IMUL64rri, res_reg, size_reg, elem_size);
    } else {
      ScratchReg scratch{this};
      auto tmp = scratch.alloc_gp();
      ASM(MOV64ri, tmp, elem_size);
      if (size_reg != res_reg) {
        ASM(MOV64rr, res_reg, size_reg);
      }
      ASM(IMUL64rr, res_reg, tmp);
    }
  }

  ASM(SUB64rr, FE_SP, res_reg);

  align = align > 16 ? align : 16;
  if (elem_size & (align - 1)) {
    assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
    ASM(AND64ri, FE_SP, ~(align - 1));
  }

  ASM(MOV64rr, res_reg, FE_SP);
}
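// Materializes a constant into dst. GP constants use xor/mov; FP constants
// use pxor/pcmpeqb for all-zeros/all-ones patterns, a GP->XMM move for small
// values when a GP register is free, and otherwise a load from a constant
// placed in the read-only data section.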
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
    const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
  const auto const_u64 = data[0];
  if (bank == Config::GP_BANK) {
    if (const_u64 == 0) {
      if (may_clobber_flags()) {
        ASM(XOR32rr, dst, dst);
      } else {
        ASM(MOV32ri, dst, 0);
      }
      return;
    }

    if (size <= 4 || u32(const_u64) == const_u64) {
      ASM(MOV32ri, dst, const_u64);
    } else {
      ASM(MOV64ri, dst, const_u64);
    }
    return;
  }

  assert(bank == Config::FP_BANK);
  const auto high_u64 = size <= 8 ? 0 : data[1];
  if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPXOR128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PXORrr, dst, dst);
    }
    return;
  }
  const u64 ones = -u64{1};
  if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VPCMPEQB128rrr, dst, dst, dst);
    } else {
      ASM(SSE_PCMPEQBrr, dst, dst);
    }
    return;
  }

  if (size <= 8) {
    AsmReg tmp =
        this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
    if (tmp.valid()) {
      this->register_file.mark_clobbered(tmp);
      materialize_constant(data, Config::GP_BANK, size, tmp);
      if (size <= 4) {
        if (has_cpu_feats(CPU_AVX)) {
          ASM(VMOVD_G2Xrr, dst, tmp);
        } else {
          ASM(SSE_MOVD_G2Xrr, dst, tmp);
        }
      } else {
        if (has_cpu_feats(CPU_AVX)) {
          ASM(VMOVQ_G2Xrr, dst, tmp);
        } else {
          ASM(SSE_MOVQ_G2Xrr, dst, tmp);
        }
      }
      return;
    }
  }

  auto alloc_size = util::align_up(size, 8);
  std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
  auto rodata = this->assembler.get_data_section(true, false);
  auto sym = this->assembler.sym_def_data(
      rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
  if (size <= 4) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 8) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else if (size <= 16) {
    if (has_cpu_feats(CPU_AVX)) {
      ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    } else {
      ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
    }
  } else {
    TPDE_FATAL("unable to materialize constant");
  }

  this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
}
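// Picks a register for a fixed assignment: prefer callee-saved registers in
// non-leaf functions (so the value survives calls), fall back to any free
// allocatable register, and finally to a used but unmodified, non-fixed
// register that can be repurposed cheaply.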
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
AsmReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
        AssignmentPartRef ap, IRValueRef) noexcept {
  RegBank bank = ap.bank();
  assert(bank.id() <= Config::NUM_BANKS);
  auto reg_mask = this->register_file.bank_regs(bank);
  reg_mask &= ~fixed_assignment_nonallocatable_mask;

  const auto find_possible_regs = [this,
                                   reg_mask](const u64 preferred_regs) -> u64 {
    u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
    return free_regs & preferred_regs & reg_mask;
  };

  u64 possible_regs;
  auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
  if (!this->stack.is_leaf_function) {
    possible_regs = find_possible_regs(csr);
  } else {
    possible_regs = find_possible_regs(~csr);
    if (possible_regs == 0) {
      possible_regs = find_possible_regs(csr);
    }
  }

  if (possible_regs == 0) {
    return AsmReg::make_invalid();
  }

  if ((possible_regs & ~this->register_file.used) != 0) {
    return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
  }

  for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
    const auto reg = AsmReg{reg_id};

    if (this->register_file.is_fixed(reg)) {
      continue;
    }

    const auto local_idx = this->register_file.reg_local_idx(reg);
    const auto part = this->register_file.reg_part(reg);
    if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
      continue;
    }

    auto *assignment = this->val_assignment(local_idx);
    auto ap = AssignmentPartRef{assignment, part};
    if (ap.modified()) {
      continue;
    }

    return reg;
  }

  return AsmReg::make_invalid();
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  case Jump::ja: return Jump::jbe;
  case Jump::jae: return Jump::jb;
  case Jump::jb: return Jump::jae;
  case Jump::jbe: return Jump::ja;
  case Jump::je: return Jump::jne;
  case Jump::jg: return Jump::jle;
  case Jump::jge: return Jump::jl;
  case Jump::jl: return Jump::jge;
  case Jump::jle: return Jump::jg;
  case Jump::jne: return Jump::je;
  case Jump::jno: return Jump::jo;
  case Jump::jo: return Jump::jno;
  case Jump::js: return Jump::jns;
  case Jump::jns: return Jump::js;
  case Jump::jp: return Jump::jnp;
  case Jump::jnp: return Jump::jp;
  default: TPDE_UNREACHABLE("invalid jump kind for invert_jump");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
    CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
        Jump jmp) noexcept {
  switch (jmp) {
  case Jump::ja: return Jump::jb;
  case Jump::jae: return Jump::jbe;
  case Jump::jb: return Jump::ja;
  case Jump::jbe: return Jump::jae;
  case Jump::je: return Jump::je;
  case Jump::jne: return Jump::jne;
  case Jump::jg: return Jump::jl;
  case Jump::jge: return Jump::jle;
  case Jump::jl: return Jump::jg;
  case Jump::jle: return Jump::jge;
  default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
FeCond CompilerX64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
    Jump jmp) noexcept {
  FeCond res = FeCond(u32(jmp) << 16);
  switch (jmp) {
  case Jump::ja: assert(res == FE_CC_A && "FeCond value mismatch?"); break;
  case Jump::jae: assert(res == FE_CC_AE && "FeCond value mismatch?"); break;
  case Jump::jb: assert(res == FE_CC_B && "FeCond value mismatch?"); break;
  case Jump::jbe: assert(res == FE_CC_BE && "FeCond value mismatch?"); break;
  case Jump::je: assert(res == FE_CC_E && "FeCond value mismatch?"); break;
  case Jump::jg: assert(res == FE_CC_G && "FeCond value mismatch?"); break;
  case Jump::jge: assert(res == FE_CC_GE && "FeCond value mismatch?"); break;
  case Jump::jl: assert(res == FE_CC_L && "FeCond value mismatch?"); break;
  case Jump::jle: assert(res == FE_CC_LE && "FeCond value mismatch?"); break;
  case Jump::jne: assert(res == FE_CC_NE && "FeCond value mismatch?"); break;
  case Jump::jno: assert(res == FE_CC_NO && "FeCond value mismatch?"); break;
  case Jump::jo: assert(res == FE_CC_O && "FeCond value mismatch?"); break;
  case Jump::js: assert(res == FE_CC_S && "FeCond value mismatch?"); break;
  case Jump::jns: assert(res == FE_CC_NS && "FeCond value mismatch?"); break;
  case Jump::jp: assert(res == FE_CC_P && "FeCond value mismatch?"); break;
  case Jump::jnp: assert(res == FE_CC_NP && "FeCond value mismatch?"); break;
  default: TPDE_UNREACHABLE("invalid conditional jump");
  }
  return res;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
    const Jump jmp,
    IRBlockRef target,
    const bool needs_split,
    const bool last_inst) noexcept {
  const auto target_idx = this->analyzer.block_idx(target);
  if (!needs_split || jmp == Jump::jmp) {
    this->derived()->move_to_phi_nodes(target_idx);

    if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
      generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
    }
  } else {
    auto tmp_label = this->text_writer.label_create();
    generate_raw_jump(invert_jump(jmp), tmp_label);

    this->derived()->move_to_phi_nodes(target_idx);

    generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);

    this->label_place(tmp_label);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
    Jump jmp, Label target_label) noexcept {
  this->text_writer.ensure_space(6);
  bool pending = this->text_writer.label_is_pending(target_label);
  void *target = this->text_writer.cur_ptr();
  if (!pending) {
    target = this->text_writer.begin_ptr() +
             this->text_writer.label_offset(target_label);
  }

  if (jmp == Jump::jmp) {
    ASMNCF(JMP, pending ? FE_JMPL : 0, target);
  } else {
    ASMNCF(Jcc, (pending ? FE_JMPL : 0) | jump_to_cond(jmp), target);
  }

  if (pending) {
    this->text_writer.label_ref(target_label,
                                this->text_writer.offset() - 4,
                                LabelFixupKind::X64_JMP_OR_MEM_DISP);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
    Jump cc, AsmReg dst, bool zext) noexcept {
  if (zext) {
    ASM(MOV32ri, dst, 0);
  }
  ASMF(SETcc8r, jump_to_cond(cc), dst);
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
    Jump cc, AsmReg dst) noexcept {
  generate_raw_set(cc, dst);
  ASM(NEG64r, dst);
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
    Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
  if (is_64) {
    ASMF(CMOVcc64rr, jump_to_cond(cc), dst, src);
  } else {
    ASMF(CMOVcc32rr, jump_to_cond(cc), dst, src);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
  assert(from < to && to <= 64);
  assert(may_clobber_flags());
  if (!sign) {
    switch (from) {
    case 8: ASM(MOVZXr32r8, dst, src); break;
    case 16: ASM(MOVZXr32r16, dst, src); break;
    case 32: ASM(MOV32rr, dst, src); break;
    default:
      if (from < 32) {
        ASM(MOV32rr, dst, src);
        ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
      } else if (dst != src) {
        ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
        ASM(AND64rr, dst, src);
      } else {
        ScratchReg tmp{this};
        AsmReg tmp_reg = tmp.alloc_gp();
        ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
        ASM(AND64rr, dst, tmp_reg);
      }
    }
  } else if (to <= 32) {
    switch (from) {
    case 8: ASM(MOVSXr32r8, dst, src); break;
    case 16: ASM(MOVSXr32r16, dst, src); break;
    default:
      ASM(MOV32rr, dst, src);
      ASM(SHL32ri, dst, 32 - from);
      ASM(SAR32ri, dst, 32 - from);
    }
  } else {
    switch (from) {
    case 8: ASM(MOVSXr64r8, dst, src); break;
    case 16: ASM(MOVSXr64r16, dst, src); break;
    case 32: ASM(MOVSXr64r32, dst, src); break;
    default:
      ASM(MOV64rr, dst, src);
      ASM(SHL64ri, dst, 64 - from);
      ASM(SAR64ri, dst, 64 - from);
    }
  }
}
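// Bit-field insert: copies the low `width` bits of src into dst at bit
// position `lsb`, leaving the remaining bits of dst intact (generate_raw_bfi)
// or zeroing them (generate_raw_bfiz).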
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfi(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
  assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
  assert(may_clobber_flags());
  ScratchReg tmp1{this};
  AsmReg tmp1_reg = tmp1.alloc_gp();

  if (width == 1) {
    ASM(BTR64ri, dst, lsb);
  } else if (lsb + width <= 31) {
    ASM(AND64ri, dst, ~(((u64{1} << width) - 1) << lsb));
  } else {
    ASM(MOV64ri, tmp1_reg, ~(((u64{1} << width) - 1) << lsb));
    ASM(AND64rr, dst, tmp1_reg);
  }

  if (width == 8) {
    ASM(MOVZXr32r8, tmp1_reg, src);
  } else if (width == 16) {
    ASM(MOVZXr32r16, tmp1_reg, src);
  } else if (width <= 32) {
    ASM(MOV32rr, tmp1_reg, src);
    ASM(AND32ri, tmp1_reg, (u32{1} << width) - 1);
  } else {
    ASM(MOV64ri, tmp1_reg, (u64{1} << width) - 1);
    ASM(AND64rr, tmp1_reg, src);
  }

  if (lsb >= 1 && lsb <= 3) {
    ASM(LEA64rm, dst, FE_MEM(dst, u8(1 << lsb), tmp1_reg, 0));
  } else {
    if (lsb > 0 && lsb + width <= 32) {
      ASM(SHL32ri, tmp1_reg, lsb);
    } else if (lsb > 0) {
      ASM(SHL64ri, tmp1_reg, lsb);
    }
    ASM(OR64rr, dst, tmp1_reg);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfiz(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
  assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
  assert(may_clobber_flags());

  if (width == 8) {
    ASM(MOVZXr32r8, dst, src);
  } else if (width == 16) {
    ASM(MOVZXr32r16, dst, src);
  } else if (width <= 32) {
    ASM(MOV32rr, dst, src);
    ASM(AND32ri, dst, (u32{1} << width) - 1);
  } else {
    ASM(MOV64ri, dst, (u64{1} << width) - 1);
    ASM(AND64rr, dst, src);
  }

  if (lsb > 0 && lsb + width <= 32) {
    ASM(SHL32ri, dst, lsb);
  } else if (lsb > 0) {
    ASM(SHL64ri, dst, lsb);
  }
}
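// For functions with dynamic allocas the argument area cannot be part of the
// fixed frame, so the first stack argument triggers a "sub rsp, imm32" whose
// immediate is patched in call_impl once the final argument area size is
// known.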
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
    set_stack_used() noexcept {
  if (this->compiler.stack.has_dynamic_alloca && stack_adjust_off == 0) {
    stack_adjust_off = this->compiler.text_writer.offset();
    ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
    assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
    ValuePart &vp, CCAssignment &cca) noexcept {
  AsmReg ptr = vp.load_to_reg(&this->compiler);
  ScratchReg scratch{&this->compiler};
  AsmReg tmp = scratch.alloc_gp();

  auto size = cca.size;
  u32 off = 0;
  while (size >= 8) {
    ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV64mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
    off += 8;
    size -= 8;
  }
  if (size >= 4) {
    ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV32mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
    off += 4;
    size -= 4;
  }
  if (size >= 2) {
    ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV16mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
    off += 2;
    size -= 2;
  }
  if (size >= 1) {
    ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
    ASMC(&this->compiler,
         MOV8mr,
         FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
         tmp);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
    ValuePart &vp, CCAssignment &cca) noexcept {
  this->set_stack_used();

  auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
  if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
    switch (cca.size) {
    case 1:
      ASMC(&this->compiler, MOV8mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    case 2:
      ASMC(&this->compiler, MOV16mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    case 4:
      ASMC(&this->compiler, MOV32mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    case 8:
      ASMC(&this->compiler, MOV64mr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    default: TPDE_UNREACHABLE("invalid GP reg size");
    }
  } else {
    assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
    switch (cca.size) {
    case 4:
      ASMC(&this->compiler, SSE_MOVD_X2Gmr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    case 8:
      ASMC(&this->compiler, SSE_MOVQ_X2Gmr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    case 16:
      ASMC(&this->compiler, SSE_MOVAPDmr,
           FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)), reg);
      break;
    default: TPDE_UNREACHABLE("invalid GP reg size");
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> class BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
    std::variant<SymRef, ValuePart> &&target) noexcept {
  if (this->assigner.is_vararg()) {
    if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
      this->compiler.evict_reg(Reg{AsmReg::AX});
    }
    Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
        Config::FP_BANK, 0);
    unsigned xmm_cnt = 8;
    if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
      xmm_cnt = next_xmm.id() - AsmReg::XMM0;
    }
    if (xmm_cnt != 0) {
      ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
    } else {
      ASMC(&this->compiler, XOR32rr, FE_AX, FE_AX);
    }
  }

  u32 sub = 0;
  if (stack_adjust_off != 0) {
    auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
    sub = util::align_up(this->assigner.get_stack_size(), 0x10);
    memcpy(inst_ptr + 3, &sub, sizeof(u32));
  } else {
    auto &max_stack_size = this->compiler.max_callee_stack_arg_size;
    max_stack_size = std::max(max_stack_size, this->assigner.get_stack_size());
  }

  if (auto *sym = std::get_if<SymRef>(&target)) {
    this->compiler.text_writer.ensure_space(16);
    ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
    this->compiler.reloc_text(
        *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
  } else {
    ValuePart &tvp = std::get<ValuePart>(target);
    if (tvp.has_assignment() && !tvp.assignment().register_valid()) {
      assert(tvp.assignment().stack_valid());
      auto off = tvp.assignment().frame_off();
      ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
    } else if (tvp.can_salvage()) {
      ASMC(&this->compiler, CALLr, tvp.salvage(&this->compiler));
    } else {
      assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
      AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
      ASMC(&this->compiler, CALLr, reg);
    }
    tvp.reset(&this->compiler);
  }

  if (stack_adjust_off != 0) {
    ASMC(&this->compiler, ADD64ri, FE_SP, sub);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
    std::variant<SymRef, ValuePart> &&target,
    std::span<CallArg> arguments,
    typename Base::ValueRef *result,
    const bool variable_args) {
  CCAssignerSysV assigner{variable_args};
  CallBuilder cb{*derived(), assigner};
  for (auto &arg : arguments) {
    cb.add_arg(std::move(arg));
  }
  cb.call(std::move(target));
  if (result) {
    cb.add_ret(*result);
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) noexcept {
  if (width_is_32) {
    ASM(CMP32ri, cmp_reg, case_value);
  } else {
    if ((i64)((i32)case_value) == (i64)case_value) {
      ASM(CMP64ri, cmp_reg, case_value);
    } else {
      this->materialize_constant(&case_value, Config::GP_BANK, 8, tmp_reg);
      ASM(CMP64rr, cmp_reg, tmp_reg);
    }
  }
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
    const Label case_label,
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) noexcept {
  switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
  generate_raw_jump(Jump::je, case_label);
}
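// Emits a bounds check against [low_bound, high_bound], then an indirect jump
// through a table of 32-bit offsets relative to the start of the table; the
// table itself is emitted inline, with fixups recorded for labels that are
// still pending.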
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
bool CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_jump_table(
    Label default_label,
    std::span<const Label> labels,
    AsmReg cmp_reg,
    AsmReg tmp_reg,
    u64 low_bound,
    u64 high_bound,
    bool width_is_32) noexcept {
  if (low_bound != 0) {
    switch_emit_cmp(cmp_reg, tmp_reg, low_bound, width_is_32);
    generate_raw_jump(Jump::jb, default_label);
  }
  switch_emit_cmp(cmp_reg, tmp_reg, high_bound, width_is_32);
  generate_raw_jump(Jump::ja, default_label);

  if (width_is_32) {
    ASM(MOV32rr, cmp_reg, cmp_reg);
  }

  if (low_bound != 0) {
    if (i32(low_bound) == i64(low_bound)) {
      ASM(SUB64ri, cmp_reg, low_bound);
    } else {
      this->materialize_constant(&low_bound, Config::GP_BANK, 8, tmp_reg);
      ASM(SUB64rr, cmp_reg, tmp_reg);
    }
  }

  Label jump_table = this->text_writer.label_create();
  ASM(LEA64rm, tmp_reg, FE_MEM(FE_IP, 0, FE_NOREG, -1));
  this->text_writer.label_ref(jump_table,
                              this->text_writer.offset() - 4,
                              LabelFixupKind::X64_JMP_OR_MEM_DISP);
  ASM(MOVSXr64m32, cmp_reg, FE_MEM(tmp_reg, 4, cmp_reg, 0));
  ASM(ADD64rr, tmp_reg, cmp_reg);
  ASM(JMPr, tmp_reg);

  this->text_writer.align(4);
  this->text_writer.ensure_space(4 + 4 * labels.size());
  this->label_place(jump_table);
  const u32 table_off = this->text_writer.offset();
  for (u32 i = 0; i < labels.size(); i++) {
    if (this->text_writer.label_is_pending(labels[i])) {
      this->text_writer.label_ref(labels[i],
                                  this->text_writer.offset(),
                                  LabelFixupKind::X64_JUMP_TABLE);
      this->text_writer.write(table_off);
    } else {
      const auto label_off = this->text_writer.label_offset(labels[i]);
      this->text_writer.write((i32)label_off - (i32)table_off);
    }
  }

  return true;
}
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
    const Label case_label,
    const Label gt_label,
    const AsmReg cmp_reg,
    const AsmReg tmp_reg,
    const u64 case_value,
    const bool width_is_32) noexcept {
  switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
  generate_raw_jump(Jump::ja, gt_label);
}
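// Computes the address of a thread-local variable. For the general-dynamic
// model this emits the canonical padded
// "lea rdi, sym@tlsgd(%rip); call __tls_get_addr@plt" sequence (the 0x66 and
// 0x48 prefix bytes keep the layout the linker expects for TLS relaxation)
// and returns the result in RAX.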
template <IRAdaptor Adaptor,
          typename Derived,
          template <typename, typename, typename> typename BaseTy,
          typename Config>
CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
    CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
        SymRef sym, TLSModel model) noexcept {
  switch (model) {
  case TLSModel::GlobalDynamic: {
    assert(!this->stack.is_leaf_function);
    assert(may_clobber_flags());
    this->stack.generated_call = true;
    auto csr = CCAssignerSysV::Info.callee_saved_regs;
    for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
      this->evict_reg(Reg{reg});
    }

    ScratchReg arg{this};
    AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);

    this->text_writer.ensure_space(0x10);
    *this->text_writer.cur_ptr()++ = 0x66;
    ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
    this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x66;
    *this->text_writer.cur_ptr()++ = 0x48;
    ASMNC(CALL, this->text_writer.cur_ptr());
    if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
      this->sym_tls_get_addr = this->assembler.sym_add_undef(
          "__tls_get_addr", Assembler::SymBinding::GLOBAL);
    }
    this->reloc_text(this->sym_tls_get_addr,
                     R_X86_64_PLT32,
                     this->text_writer.offset() - 4,
                     -4);

    ScratchReg res{this};
    res.alloc_specific(AsmReg::AX);