CompilerX64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "AssemblerElfX64.hpp"
7#include "tpde/CompilerBase.hpp"
8#include "tpde/ValLocalIdx.hpp"
9#include "tpde/ValueAssignment.hpp"
10#include "tpde/base.hpp"
11
12#include <bit>
13
14#ifdef TPDE_ASSERTS
15 #include <fadec.h>
16#endif
17
18// Helper macros for assembling in the compiler
19#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
20 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
21#endif
22
// Use a helper since the arguments might call ASM themselves; the current
// text pointer must therefore be evaluated after the arguments.
25#define ASM_FULL(compiler, reserve, op, ...) \
26 ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))
27
28#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
29#define ASMC(compiler, op, ...) \
30 ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
31#define ASMF(op, flag, ...) \
32 ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
33#define ASMNCF(op, flag, ...) \
34 ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
35#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
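
// For illustration: ASM(ADD64rr, dst, src) reserves 16 bytes in the text
// section and appends the encoding produced by fe64_ADD64rr(cur_ptr, 0, dst,
// src). The ASMNC variants skip the space reservation (the caller is expected
// to have reserved space already), and the *F variants forward an explicit
// encoding flag such as FE_JMPL.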
36
37namespace tpde::x64 {
38
39struct AsmReg : Reg {
40 enum REG : u8 {
41 AX = 0,
42 CX,
43 DX,
44 BX,
45 SP,
46 BP,
47 SI,
48 DI,
49 R8,
50 R9,
51 R10,
52 R11,
53 R12,
54 R13,
55 R14,
56 R15,
57
58 XMM0 = 32,
59 XMM1,
60 XMM2,
61 XMM3,
62 XMM4,
63 XMM5,
64 XMM6,
65 XMM7,
66 XMM8,
67 XMM9,
68 XMM10,
69 XMM11,
70 XMM12,
71 XMM13,
72 XMM14,
73 XMM15,
74 // TODO(ts): optional support for AVX registers with compiler flag
75 };
76
77 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
78
79 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
80
81 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
82
83 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
84 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
85 }
86
87 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
88 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
89 }
90
91 constexpr operator FeRegGP() const noexcept {
92 assert(reg_id <= R15);
93 return FeRegGP{reg_id};
94 }
95
96 operator FeRegGPLH() const noexcept {
97 assert(reg_id <= R15);
98 return FeRegGP{reg_id};
99 }
100
101 constexpr operator FeRegXMM() const noexcept {
102 assert(reg_id >= XMM0 && reg_id <= XMM15);
103 return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
104 }
105};
106
107constexpr static u64
108 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
109 u64 set = 0;
110 for (const auto reg : regs) {
111 set |= 1ull << reg;
112 }
113 return set;
114}
115
116template <size_t N>
117constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
118 u64 set = 0;
119 for (const auto reg : regs) {
120 set |= 1ull << reg.id();
121 }
122 return set;
123}
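
// For example, create_bitmask({AsmReg::AX, AsmReg::DX}) yields
// (1 << 0) | (1 << 2) == 0b101; XMM registers occupy bits 32-47.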
124
125class CCAssignerSysV : public CCAssigner {
126public:
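  // Register numbering (see AsmReg::REG): GP registers occupy bit positions
  // 0-15 and XMM registers positions 32-47 of a register bitmask, hence the
  // 0xFFFF'0000'FFFF allocatable mask below; RSP and RBP are excluded.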
127 static constexpr CCInfo Info{
128 .allocatable_regs =
129 0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
130 .callee_saved_regs = create_bitmask({
131 AsmReg::BX,
132 AsmReg::R12,
133 AsmReg::R13,
134 AsmReg::R14,
135 AsmReg::R15,
136 }),
137 .arg_regs = create_bitmask({
138 AsmReg::DI,
139 AsmReg::SI,
140 AsmReg::DX,
141 AsmReg::CX,
142 AsmReg::R8,
143 AsmReg::R9,
144 AsmReg::XMM0,
145 AsmReg::XMM1,
146 AsmReg::XMM2,
147 AsmReg::XMM3,
148 AsmReg::XMM4,
149 AsmReg::XMM5,
150 AsmReg::XMM6,
151 AsmReg::XMM7,
152 }),
153 };
154
155private:
156 u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
157 // The next N assignments must go to the stack.
158 unsigned must_assign_stack = 0;
159 bool vararg;
160 u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;
161
162public:
163 CCAssignerSysV(bool vararg = false) noexcept
164 : CCAssigner(Info), vararg(vararg) {}
165
166 void reset() noexcept override {
167 gp_cnt = xmm_cnt = stack = 0;
168 must_assign_stack = 0;
169 vararg = false;
170 ret_gp_cnt = ret_xmm_cnt = 0;
171 }
172
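  // Assign one argument part following the SysV convention: byval arguments
  // always go to the stack, the first six GP parts go to DI, SI, DX, CX, R8
  // and R9, the first eight FP parts go to XMM0-XMM7, and everything else is
  // passed on the stack. For GP parts, a part with `consecutive` follow-up
  // parts is only assigned to registers if all of them still fit; once a part
  // goes to the stack, its follow-up parts do as well.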
173 void assign_arg(CCAssignment &arg) noexcept override {
174 if (arg.byval) {
175 stack = util::align_up(stack, arg.byval_align < 8 ? 8 : arg.byval_align);
176 arg.stack_off = stack;
177 stack += arg.byval_size;
178 return;
179 }
180
181 if (arg.bank == RegBank{0}) {
182 static constexpr std::array<AsmReg, 6> gp_arg_regs{
183 AsmReg::DI,
184 AsmReg::SI,
185 AsmReg::DX,
186 AsmReg::CX,
187 AsmReg::R8,
188 AsmReg::R9,
189 };
190 if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
191 arg.reg = gp_arg_regs[gp_cnt];
192 gp_cnt += 1;
193 } else {
194 // Next N arguments must also be assigned to the stack
195 // Increment by one, the value is immediately decremented below.
196 must_assign_stack = arg.consecutive + 1;
197 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
198 arg.stack_off = stack;
199 stack += 8;
200 }
201 } else {
202 if (!must_assign_stack && xmm_cnt < 8) {
203 arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
204 xmm_cnt += 1;
205 } else {
206 // Next N arguments must also be assigned to the stack
207 // Increment by one, the value is immediately decremented below.
208 must_assign_stack = arg.consecutive + 1;
209 u32 size = util::align_up(arg.size, 8);
210 stack = util::align_up(stack, size);
211 arg.stack_off = stack;
212 stack += size;
213 }
214 }
215
216 if (must_assign_stack > 0) {
217 must_assign_stack -= 1;
218 }
219 }
220
221 u32 get_stack_size() noexcept override { return stack; }
222
223 bool is_vararg() const noexcept override { return vararg; }
224
225 void assign_ret(CCAssignment &arg) noexcept override {
226 assert(!arg.byval && !arg.sret);
227 if (arg.bank == RegBank{0}) {
228 if (ret_gp_cnt + arg.consecutive < 2) {
229 arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
230 ret_gp_cnt += 1;
231 } else {
232 assert(false);
233 }
234 } else {
235 if (ret_xmm_cnt + arg.consecutive < 2) {
236 arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
237 ret_xmm_cnt += 1;
238 } else {
239 assert(false);
240 }
241 }
242 }
243};
244
245struct PlatformConfig : CompilerConfigDefault {
246 using Assembler = AssemblerElfX64;
247 using AsmReg = tpde::x64::AsmReg;
248 using DefaultCCAssigner = CCAssignerSysV;
249
250 static constexpr RegBank GP_BANK{0};
251 static constexpr RegBank FP_BANK{1};
252 static constexpr bool FRAME_INDEXING_NEGATIVE = true;
253 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
254 static constexpr u32 NUM_BANKS = 2;
255};
256
257namespace concepts {
258template <typename T, typename Config>
259concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
260 {
261 a.arg_is_int128(std::declval<typename T::IRValueRef>())
262 } -> std::convertible_to<bool>;
263
264 {
265 a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
266 } -> std::convertible_to<bool>;
267};
268} // namespace concepts
269
270template <IRAdaptor Adaptor,
271 typename Derived,
272 template <typename, typename, typename> typename BaseTy =
273 CompilerBase,
274 typename Config = PlatformConfig>
275struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
276 using Base = BaseTy<Adaptor, Derived, Config>;
277
278 using IRValueRef = typename Base::IRValueRef;
279 using IRBlockRef = typename Base::IRBlockRef;
280 using IRFuncRef = typename Base::IRFuncRef;
281
282 using ScratchReg = typename Base::ScratchReg;
283 using ValuePartRef = typename Base::ValuePartRef;
284 using ValuePart = typename Base::ValuePart;
285 using GenericValuePart = typename Base::GenericValuePart;
286
287 using Assembler = typename PlatformConfig::Assembler;
288 using RegisterFile = typename Base::RegisterFile;
289
290 using CallArg = typename Base::CallArg;
291
292 using Base::derived;
293
294
295 // TODO(ts): make this dependent on the number of callee-saved regs of the
296 // current function or if there is a call in the function?
297 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
298 6};
299
300 enum CPU_FEATURES : u32 {
301 CPU_BASELINE = 0, // x86-64-v1
302 CPU_CMPXCHG16B = (1 << 0),
303 CPU_POPCNT = (1 << 1),
304 CPU_SSE3 = (1 << 2),
305 CPU_SSSE3 = (1 << 3),
306 CPU_SSE4_1 = (1 << 4),
307 CPU_SSE4_2 = (1 << 5),
308 CPU_AVX = (1 << 6),
309 CPU_AVX2 = (1 << 7),
310 CPU_BMI1 = (1 << 8),
311 CPU_BMI2 = (1 << 9),
312 CPU_F16C = (1 << 10),
313 CPU_FMA = (1 << 11),
314 CPU_LZCNT = (1 << 12),
315 CPU_MOVBE = (1 << 13),
316 CPU_AVX512F = (1 << 14),
317 CPU_AVX512BW = (1 << 15),
318 CPU_AVX512CD = (1 << 16),
319 CPU_AVX512DQ = (1 << 17),
320 CPU_AVX512VL = (1 << 18),
321
322 CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
323 CPU_SSE4_1 | CPU_SSE4_2,
324 CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
325 CPU_FMA | CPU_LZCNT | CPU_MOVBE,
326 CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
327 CPU_AVX512VL,
328 };
329
330 CPU_FEATURES cpu_feats = CPU_BASELINE;
331
  // When handling function arguments, we need to prevent argument registers
  // from being handed out as fixed registers.
  //
  // Additionally, for now we prevent AX, DX and CX from being fixed so we do
  // not run into issues with instructions that need them as implicit
  // operands. AX and DX can also never be fixed when exception handling is
  // used, since they are clobbered there.
339 u64 fixed_assignment_nonallocatable_mask =
340 create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
341 u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
342 func_reg_restore_alloc = 0u;
343 /// Offset to the `sub rsp, XXX` instruction that sets up the frame
344 u32 frame_size_setup_offset = 0u;
345 /// For vararg functions only: number of scalar and xmm registers used.
346 // TODO: this information should be obtained from the CCAssigner.
347 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
348 u32 reg_save_frame_off = 0;
349 u32 var_arg_stack_off = 0;
350 util::SmallVector<u32, 8> func_ret_offs = {};
351
352 /// Symbol for __tls_get_addr.
353 Assembler::SymRef sym_tls_get_addr;
354
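  /// Builds a call sequence: assigns arguments via the CCAssigner, spills
  /// stack arguments into the (dynamically sized) argument area, emits the
  /// call (direct via PLT relocation or indirect through a register/memory
  /// operand) and restores the stack pointer afterwards.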
355 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
356 u32 stack_adjust_off = 0;
357
358 void set_stack_used() noexcept;
359
360 public:
361 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
362 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
363
364 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
365 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
366 void call_impl(
367 std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept;
368 void reset_stack() noexcept;
369 };
370
371 // for now, always generate an object
372 explicit CompilerX64(Adaptor *adaptor,
373 const CPU_FEATURES cpu_features = CPU_BASELINE)
374 : Base{adaptor}, cpu_feats(cpu_features) {
375 static_assert(std::is_base_of_v<CompilerX64, Derived>);
376 static_assert(concepts::Compiler<Derived, PlatformConfig>);
377 }
378
379 template <typename... Args>
380 auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
381 struct Helper {
382 CompilerX64 *compiler;
383 decltype(enc_fn) fn;
384 void encode(unsigned reserve, int flags, Args... args) {
385 if (reserve) {
386 compiler->text_writer.ensure_space(reserve);
387 }
388 unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
389 assert(n != 0);
390 compiler->text_writer.cur_ptr() += n;
391 }
392 };
393 return Helper{this, enc_fn};
394 }
395
396 void start_func(u32 func_idx) noexcept;
397
398 void gen_func_prolog_and_args(CCAssigner *) noexcept;
399
400 void finish_func(u32 func_idx) noexcept;
401
402 void reset() noexcept;
403
404 // helpers
405
406 void gen_func_epilog() noexcept;
407
408 void
409 spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;
410
411 void load_from_stack(AsmReg dst,
412 i32 frame_off,
413 u32 size,
414 bool sign_extend = false) noexcept;
415
416 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
417
418 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
419
420 GenericValuePart val_spill_slot(ValuePart &val_ref) noexcept {
421 const auto ap = val_ref.assignment();
422 assert(ap.stack_valid() && !ap.variable_ref());
423 return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
424 }
425
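  /// Materialize the address expression of a GenericValuePart
  /// (base + index * scale + displacement) into a single register, reusing
  /// an owned scratch register for the result where possible.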
426 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
427
428 void materialize_constant(const u64 *data,
429 RegBank bank,
430 u32 size,
431 AsmReg dst) noexcept;
432
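  /// Select a register to use for a fixed assignment in the given bank: only
  /// callee-saved registers are used if the function may emit calls, free
  /// registers are preferred over used-but-unfixed ones, and registers in
  /// fixed_assignment_nonallocatable_mask are never returned.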
433 AsmReg select_fixed_assignment_reg(RegBank bank, IRValueRef) noexcept;
434
435 enum class Jump {
436 ja,
437 jae,
438 jb,
439 jbe,
440 je,
441 jg,
442 jge,
443 jl,
444 jle,
445 jmp,
446 jne,
447 jno,
448 jo,
449 js,
450 jns,
451 jp,
452 jnp,
453 };
454
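  /// Return the condition that holds exactly when `jmp` does not
  /// (e.g. ja -> jbe, je -> jne); jmp maps to itself.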
455 Jump invert_jump(Jump jmp) noexcept;
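  /// Return the condition for swapped comparison operands
  /// (e.g. jl -> jg, jb -> ja); equality and single-flag tests are unchanged.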
456 Jump swap_jump(Jump jmp) noexcept;
457
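  /// Branch to an IR block, emitting the moves into the target's PHI nodes
  /// first. For a conditional branch with needs_split set, an inverted-
  /// condition jump skips over the PHI moves. A jump to the immediately
  /// following block is omitted when it would be the last instruction.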
458 void generate_branch_to_block(Jump jmp,
459 IRBlockRef target,
460 bool needs_split,
461 bool last_inst) noexcept;
462
463 void generate_raw_jump(Jump jmp, Assembler::Label target) noexcept;
464
465 /// Set dst to 1 if cc is true, otherwise set it to zero
466 void generate_raw_set(Jump cc, AsmReg dst) noexcept;
467 /// Set all bits of dst to 1 if cc is true, otherwise set it to zero
468 void generate_raw_mask(Jump cc, AsmReg dst) noexcept;
469 /// Move src into dst if cc is true, otherwise do nothing
470 void generate_raw_cmov(Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept;
471
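  /// Sign- or zero-extend the low `from` bits of src to a `to`-bit value in
  /// dst (requires from < to <= 64).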
472 void generate_raw_intext(
473 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
474
  /// Generate a function call
  ///
  /// This will move the arguments into the correct registers according to
  /// the calling convention, clear non-callee-saved registers from the
  /// register file (make sure you do not have any fixed assignments left
  /// over) and fill the result registers (the u8 in the ScratchReg pair
  /// indicates the register bank).
  ///
  /// The target can be a symbol (a call via PLT with relocation) or an
  /// indirect call to a ValuePart. The result is an optional reference.
485 void generate_call(std::variant<Assembler::SymRef, ValuePart> &&target,
486 std::span<CallArg> arguments,
487 typename Base::ValueRef *result,
488 bool variable_args = false);
489
490 /// Generate code sequence to load address of sym into a register. This will
491 /// generate a function call for dynamic TLS access models.
492 ScratchReg tls_get_addr(Assembler::SymRef sym, TLSModel model) noexcept;
493
494 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
495 return ((cpu_feats & feats) == feats);
496 }
497};
498
499template <IRAdaptor Adaptor,
500 typename Derived,
501 template <typename, typename, typename> class BaseTy,
502 typename Config>
503void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
504 const u32 /*func_idx*/) noexcept {
505 this->text_writer.align(16);
506 this->assembler.except_begin_func();
507}
508
509template <IRAdaptor Adaptor,
510 typename Derived,
511 template <typename, typename, typename> typename BaseTy,
512 typename Config>
513void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
514 CCAssigner *cc_assigner) noexcept {
515 // prologue:
516 // push rbp
517 // mov rbp, rsp
518 // optionally create vararg save-area
519 // reserve space for callee-saved regs
520 // = 1 byte for each of the lower 8 regs and 2
521 // bytes for the higher 8 regs
522 // sub rsp, #<frame_size>+<largest_call_frame_usage>
523
  // TODO(ts): technically we only need rbp if there is a dynamic alloca, but
  // then we need to make the frame indexing dynamic in CompilerBase, and the
  // unwind info needs to take the dynamic sub rsp for calls into account
529
530 func_ret_offs.clear();
531 func_start_off = this->text_writer.offset();
532 scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;
533
534 const CCInfo &cc_info = cc_assigner->get_ccinfo();
535
536 ASM(PUSHr, FE_BP);
537 ASM(MOV64rr, FE_BP, FE_SP);
538
539 func_reg_save_off = this->text_writer.offset();
540
541 auto csr = cc_info.callee_saved_regs;
542 assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
543 "non-gp callee-saved registers not implemented");
544
545 u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
546 u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
547 // R8 and higher need a REX prefix.
548 u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
549 this->stack.frame_size = 8 * (csr_logp + csr_higp);
550
551 this->text_writer.ensure_space(reg_save_size);
552 this->text_writer.cur_ptr() += reg_save_size;
553 func_reg_save_alloc = reg_save_size;
  // pop uses the same number of bytes as push
555 func_reg_restore_alloc = reg_save_size;
556
557 // TODO(ts): support larger stack alignments?
558
559 // placeholder for later
560 frame_size_setup_offset = this->text_writer.offset();
561 ASM(SUB64ri, FE_SP, 0x7FFF'FFFF);
562#ifdef TPDE_ASSERTS
563 assert((this->text_writer.offset() - frame_size_setup_offset) == 7);
564#endif
565
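  // For vararg functions, spill the argument registers into the SysV
  // register save area (6 * 8 bytes for the GP argument registers followed
  // by 8 * 16 bytes for XMM0-XMM7) so that va_arg can read them later. AL
  // holds the number of vector arguments, so the XMM stores are skipped when
  // it is zero.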
566 if (this->adaptor->cur_is_vararg()) {
567 this->stack.frame_size += 6 * 8 + 8 * 16;
568 reg_save_frame_off = this->stack.frame_size;
569 auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
570 ASM(MOV64mr, mem, FE_DI);
571 mem.off += 8;
572 ASM(MOV64mr, mem, FE_SI);
573 mem.off += 8;
574 ASM(MOV64mr, mem, FE_DX);
575 mem.off += 8;
576 ASM(MOV64mr, mem, FE_CX);
577 mem.off += 8;
578 ASM(MOV64mr, mem, FE_R8);
579 mem.off += 8;
580 ASM(MOV64mr, mem, FE_R9);
581 auto skip_fp = this->assembler.label_create();
582 ASM(TEST8rr, FE_AX, FE_AX);
583 generate_raw_jump(Jump::je, skip_fp);
584 mem.off += 8;
585 ASM(SSE_MOVDQUmr, mem, FE_XMM0);
586 mem.off += 16;
587 ASM(SSE_MOVDQUmr, mem, FE_XMM1);
588 mem.off += 16;
589 ASM(SSE_MOVDQUmr, mem, FE_XMM2);
590 mem.off += 16;
591 ASM(SSE_MOVDQUmr, mem, FE_XMM3);
592 mem.off += 16;
593 ASM(SSE_MOVDQUmr, mem, FE_XMM4);
594 mem.off += 16;
595 ASM(SSE_MOVDQUmr, mem, FE_XMM5);
596 mem.off += 16;
597 ASM(SSE_MOVDQUmr, mem, FE_XMM6);
598 mem.off += 16;
599 ASM(SSE_MOVDQUmr, mem, FE_XMM7);
600 this->label_place(skip_fp);
601 }
602
603 // Temporarily prevent argument registers from being assigned.
604 assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
605 "argument registers must also be allocatable");
606 this->register_file.allocatable &= ~cc_info.arg_regs;
607
608 u32 arg_idx = 0;
609 for (const IRValueRef arg : this->adaptor->cur_args()) {
610 derived()->handle_func_arg(
611 arg_idx,
612 arg,
613 [&](ValuePart &&vp, CCAssignment cca) -> std::optional<i32> {
614 cca.bank = vp.bank();
615 cca.size = vp.part_size();
616
617 cc_assigner->assign_arg(cca);
618
619 if (cca.reg.valid()) [[likely]] {
620 vp.set_value_reg(this, cca.reg);
621 // Mark register as allocatable as soon as it is assigned. If the
622 // argument is unused, the register will be freed immediately and
623 // can be used for later stack arguments.
624 this->register_file.allocatable |= u64{1} << cca.reg.id();
625 return {};
626 }
627
628 if (vp.is_owned()) {
629 // no need to handle unused arguments
630 return {};
631 }
632
633 if (cca.byval) {
634 // Return byval frame_off.
635 return 0x10 + cca.stack_off;
636 } else {
637 // TODO(ts): maybe allow negative frame offsets for value
638 // assignments so we can simply reference this?
639 // but this probably doesn't work with multi-part values
640 // since the offsets are different
641 AsmReg dst = vp.alloc_reg(this);
642 this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);
643 }
644 return {};
645 });
646
647 arg_idx += 1;
648 }
649
650 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
651 // TODO: get this from CCAssigner?
652 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
653 u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
654 u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
655 this->scalar_arg_count = std::popcount(gp_regs);
656 this->vec_arg_count = std::popcount(xmm_regs);
657 this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
658 }
659
660 this->register_file.allocatable |= cc_info.arg_regs;
661}
662
663template <IRAdaptor Adaptor,
664 typename Derived,
665 template <typename, typename, typename> typename BaseTy,
666 typename Config>
667void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
668 u32 func_idx) noexcept {
669 // NB: code alignment factor 1, data alignment factor -8.
670 auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
671 // push rbp
672 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
673 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
674 this->assembler.eh_write_inst(
675 dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
676 // mov rbp, rsp
677 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
678 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
679 dwarf::x64::DW_reg_rbp);
680
681 // Patched below
682 auto fde_prologue_adv_off = this->assembler.eh_writer.size();
683 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
684
685 auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
686 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
687 u64 saved_regs = this->register_file.clobbered & csr;
688 u32 num_saved_regs = 0u;
689 for (auto reg : util::BitSetIterator{saved_regs}) {
690 assert(reg <= AsmReg::R15);
691 write_ptr +=
692 fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
693 ++num_saved_regs;
694
695 // DWARF register ordering is subtly different from the encoding:
696 // x86 is: ax, cx, dx, bx, sp, bp, si, di, r8, ...
697 // DWARF is: ax, dx, cx, bx, si, di, bp, sp, r8, ...
698 static const u8 gpreg_to_dwarf[] = {
699 dwarf::x64::DW_reg_rax,
700 dwarf::x64::DW_reg_rcx,
701 dwarf::x64::DW_reg_rdx,
702 dwarf::x64::DW_reg_rbx,
703 dwarf::x64::DW_reg_rsp,
704 dwarf::x64::DW_reg_rbp,
705 dwarf::x64::DW_reg_rsi,
706 dwarf::x64::DW_reg_rdi,
707 dwarf::x64::DW_reg_r8,
708 dwarf::x64::DW_reg_r9,
709 dwarf::x64::DW_reg_r10,
710 dwarf::x64::DW_reg_r11,
711 dwarf::x64::DW_reg_r12,
712 dwarf::x64::DW_reg_r13,
713 dwarf::x64::DW_reg_r14,
714 dwarf::x64::DW_reg_r15,
715 };
716 u8 dwarf_reg = gpreg_to_dwarf[reg];
717 auto cfa_off = num_saved_regs + 2;
718 this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
719 }
720
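  // The remaining prologue advance is encoded in a single DW_CFA_advance_loc
  // whose delta is limited to 6 bits; the 4 bytes of "push rbp; mov rbp, rsp"
  // are already covered by the advances above, hence the assertion that the
  // prologue stays below 0x44 bytes.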
721 u32 prologue_size =
722 write_ptr - (this->text_writer.begin_ptr() + func_start_off);
723 assert(prologue_size < 0x44);
724 this->assembler.eh_writer.data()[fde_prologue_adv_off] =
725 dwarf::DW_CFA_advance_loc | (prologue_size - 4);
726
727 // The frame_size contains the reserved frame size so we need to subtract
728 // the stack space we used for the saved registers
729 const auto final_frame_size =
730 util::align_up(this->stack.frame_size, 16) - num_saved_regs * 8;
731 *reinterpret_cast<u32 *>(this->text_writer.begin_ptr() +
732 frame_size_setup_offset + 3) = final_frame_size;
733#ifdef TPDE_ASSERTS
734 FdInstr instr = {};
735 assert(fd_decode(this->text_writer.begin_ptr() + frame_size_setup_offset,
736 7,
737 64,
738 0,
739 &instr) == 7);
740 assert(FD_TYPE(&instr) == FDI_SUB);
741 assert(FD_OP_TYPE(&instr, 0) == FD_OT_REG);
742 assert(FD_OP_TYPE(&instr, 1) == FD_OT_IMM);
743 assert(FD_OP_SIZE(&instr, 0) == 8);
744 assert(FD_OP_SIZE(&instr, 1) == 8);
745 assert(FD_OP_IMM(&instr, 1) == final_frame_size);
746#endif
747
748 // nop out the rest
749 const auto reg_save_end =
750 this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
751 assert(reg_save_end >= write_ptr);
752 const u32 nop_len = reg_save_end - write_ptr;
753 if (nop_len) {
754 fe64_NOP(write_ptr, nop_len);
755 }
756
757 auto func_sym = this->func_syms[func_idx];
758 auto func_sec = this->text_writer.get_sec_ref();
759 if (func_ret_offs.empty()) {
760 // TODO(ts): honor cur_needs_unwind_info
761 auto func_size = this->text_writer.offset() - func_start_off;
762 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
763 this->assembler.eh_end_fde(fde_off, func_sym);
764 this->assembler.except_encode_func(func_sym);
765 return;
766 }
767
768 auto *text_data = this->text_writer.begin_ptr();
769 u32 first_ret_off = func_ret_offs[0];
770 u32 ret_size = 0;
771 u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc; // add + pop + ret
772 u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;
773 {
774 write_ptr = text_data + first_ret_off;
775 const auto ret_start = write_ptr;
776 if (this->adaptor->cur_has_dynamic_alloca()) {
777 if (num_saved_regs == 0) {
778 write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
779 } else {
780 write_ptr +=
781 fe64_LEA64rm(write_ptr,
782 0,
783 FE_SP,
784 FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
785 }
786 } else {
787 write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, final_frame_size);
788 }
789 for (auto reg : util::BitSetIterator<true>{saved_regs}) {
790 assert(reg <= AsmReg::R15);
791 write_ptr +=
792 fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
793 }
794 write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
795 write_ptr += fe64_RET(write_ptr, 0);
796 ret_size = write_ptr - ret_start;
797 assert(ret_size <= epilogue_size && "function epilogue too long");
798
799 // write NOP for better disassembly
800 if (epilogue_size > ret_size) {
801 fe64_NOP(write_ptr, epilogue_size - ret_size);
802 if (first_ret_off == func_end_ret_off) {
803 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
804 }
805 }
806 }
807
808 for (u32 i = 1; i < func_ret_offs.size(); ++i) {
809 std::memcpy(
810 text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
811 if (func_ret_offs[i] == func_end_ret_off) {
812 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
813 }
814 }
815
  // Do sym_def at the very end; we shorten the function here again, so only
  // at this point do we know the actual size of the function.
818 // TODO(ts): honor cur_needs_unwind_info
819 auto func_size = this->text_writer.offset() - func_start_off;
820 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
821 this->assembler.eh_end_fde(fde_off, func_sym);
822 this->assembler.except_encode_func(func_sym);
823}
824
825template <IRAdaptor Adaptor,
826 typename Derived,
827 template <typename, typename, typename> typename BaseTy,
828 typename Config>
829void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
830 func_ret_offs.clear();
831 sym_tls_get_addr = {};
832 Base::reset();
833}
834
835template <IRAdaptor Adaptor,
836 typename Derived,
837 template <typename, typename, typename> typename BaseTy,
838 typename Config>
839void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
840 // epilogue:
841 // if !func_has_dynamic_alloca:
842 // add rsp, #<frame_size>+<largest_call_frame_usage>
843 // else:
844 // lea rsp, [rbp - <size_of_reg_save_area>]
845 // for each saved reg:
846 // pop <reg>
847 // pop rbp
848 // ret
849 //
850 // however, since we will later patch this, we only
851 // reserve the space for now
852
853 func_ret_offs.push_back(this->text_writer.offset());
854
855 // add reg, imm32
856 // and
857 // lea rsp, [rbp - imm32]
858 // both take 7 bytes
859 u32 epilogue_size =
860 7 + 1 + 1 +
861 func_reg_restore_alloc; // add/lea + pop + ret + size of reg restore
862
863 this->text_writer.ensure_space(epilogue_size);
864 this->text_writer.cur_ptr() += epilogue_size;
865}
866
867template <IRAdaptor Adaptor,
868 typename Derived,
869 template <typename, typename, typename> typename BaseTy,
870 typename Config>
871void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
872 const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
873 this->text_writer.ensure_space(16);
874 assert(frame_off < 0);
875 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
876 if (reg.id() <= AsmReg::R15) {
877 switch (size) {
878 case 1: ASMNC(MOV8mr, mem, reg); break;
879 case 2: ASMNC(MOV16mr, mem, reg); break;
880 case 4: ASMNC(MOV32mr, mem, reg); break;
881 case 8: ASMNC(MOV64mr, mem, reg); break;
882 default: TPDE_UNREACHABLE("invalid spill size");
883 }
884 return;
885 }
886
887 switch (size) {
888 case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
889 case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
890 case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
891 default: TPDE_UNREACHABLE("invalid spill size");
892 }
893}
894
895template <IRAdaptor Adaptor,
896 typename Derived,
897 template <typename, typename, typename> typename BaseTy,
898 typename Config>
899void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
900 const AsmReg dst,
901 const i32 frame_off,
902 const u32 size,
903 const bool sign_extend) noexcept {
904 this->text_writer.ensure_space(16);
905 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
906
907 if (dst.id() <= AsmReg::R15) {
908 if (!sign_extend) {
909 switch (size) {
910 case 1: ASMNC(MOVZXr32m8, dst, mem); break;
911 case 2: ASMNC(MOVZXr32m16, dst, mem); break;
912 case 4: ASMNC(MOV32rm, dst, mem); break;
913 case 8: ASMNC(MOV64rm, dst, mem); break;
914 default: TPDE_UNREACHABLE("invalid spill size");
915 }
916 } else {
917 switch (size) {
918 case 1: ASMNC(MOVSXr64m8, dst, mem); break;
919 case 2: ASMNC(MOVSXr64m16, dst, mem); break;
920 case 4: ASMNC(MOVSXr64m32, dst, mem); break;
921 case 8: ASMNC(MOV64rm, dst, mem); break;
922 default: TPDE_UNREACHABLE("invalid spill size");
923 }
924 }
925 return;
926 }
927
928 assert(!sign_extend);
929
930 switch (size) {
931 case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
932 case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
933 case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
934 default: TPDE_UNREACHABLE("invalid spill size");
935 }
936}
937
938template <IRAdaptor Adaptor,
939 typename Derived,
940 template <typename, typename, typename> typename BaseTy,
941 typename Config>
942void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
943 const AsmReg dst, const AssignmentPartRef ap) noexcept {
944 ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
945}
946
947template <IRAdaptor Adaptor,
948 typename Derived,
949 template <typename, typename, typename> typename BaseTy,
950 typename Config>
951void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
952 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
953 assert(dst.valid());
954 assert(src.valid());
955 if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
956 if (size > 4) {
957 ASM(MOV64rr, dst, src);
958 } else {
959 ASM(MOV32rr, dst, src);
960 }
961 } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
962 if (size <= 16) {
963 if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
964 assert(has_cpu_feats(CPU_AVX512F));
965 ASM(VMOVAPD128rr, dst, src);
966 } else {
967 ASM(SSE_MOVAPDrr, dst, src);
968 }
969 } else if (size <= 32) {
970 assert(has_cpu_feats(CPU_AVX));
971 assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
972 has_cpu_feats(CPU_AVX512F));
973 ASM(VMOVAPD256rr, dst, src);
974 } else {
975 assert(size <= 64);
976 assert(has_cpu_feats(CPU_AVX512F));
977 ASM(VMOVAPD512rr, dst, src);
978 }
979 } else if (dst.id() <= AsmReg::R15) {
980 // gp<-xmm
981 assert(src.id() >= AsmReg::XMM0);
982 assert(size <= 8);
983 if (src.id() > AsmReg::XMM15) {
984 assert(has_cpu_feats(CPU_AVX512F));
985 if (size <= 4) {
986 ASM(VMOVD_X2Grr, dst, src);
987 } else {
988 ASM(VMOVQ_X2Grr, dst, src);
989 }
990 } else {
991 if (size <= 4) {
992 ASM(SSE_MOVD_X2Grr, dst, src);
993 } else {
994 ASM(SSE_MOVQ_X2Grr, dst, src);
995 }
996 }
997 } else {
998 // xmm<-gp
999 assert(src.id() <= AsmReg::R15);
1000 assert(dst.id() >= AsmReg::XMM0);
1001 assert(size <= 8);
1002 if (dst.id() > AsmReg::XMM15) {
1003 assert(has_cpu_feats(CPU_AVX512F));
1004 if (size <= 4) {
1005 ASM(VMOVD_G2Xrr, dst, src);
1006 } else {
1007 ASM(VMOVQ_G2Xrr, dst, src);
1008 }
1009 } else {
1010 if (size <= 4) {
1011 ASM(SSE_MOVD_G2Xrr, dst, src);
1012 } else {
1013 ASM(SSE_MOVQ_G2Xrr, dst, src);
1014 }
1015 }
1016 }
1017}
1018
1019template <IRAdaptor Adaptor,
1020 typename Derived,
1021 template <typename, typename, typename> typename BaseTy,
1022 typename Config>
1023AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1024 GenericValuePart &gv) noexcept {
1025 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1026
1027 ScratchReg scratch{derived()};
1028 bool disp32 = i32(expr.disp) == expr.disp;
1029 AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
1030 AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
1031 if (std::holds_alternative<ScratchReg>(expr.base)) {
1032 scratch = std::move(std::get<ScratchReg>(expr.base));
1033 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1034 scratch = std::move(std::get<ScratchReg>(expr.index));
1035 } else {
1036 (void)scratch.alloc_gp();
1037 }
1038 auto dst = scratch.cur_reg();
1039 if (idx.valid()) {
1040 if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
1041 u8 sc = expr.scale;
1042 if (base.valid() && disp32) {
1043 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
1044 expr.disp = 0;
1045 } else if (base.valid()) {
1046 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
1047 } else if (disp32) {
1048 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
1049 } else {
1050 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
1051 }
1052 } else {
1053 u64 scale = expr.scale;
1054 if (base == idx) {
1055 base = AsmReg::make_invalid();
1056 scale += 1;
1057 }
1058
1059 ScratchReg idx_scratch{derived()};
1060 // We need a register to compute the scaled index.
1061 AsmReg idx_tmp = dst;
1062 if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
1063 // We can't use dst, it'd clobber base, so use the other
1064 // register we currently own.
1065 idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
1066 } else if (dst == base) {
1067 idx_tmp = idx_scratch.alloc_gp();
1068 }
1069
1070 if ((scale & (scale - 1)) == 0) {
1071 if (idx_tmp != idx) {
1072 ASM(MOV64rr, idx_tmp, idx);
1073 }
1074 ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
1075 } else {
1076 if (i32(scale) == i64(scale)) {
1077 ASM(IMUL64rri, idx_tmp, idx, scale);
1078 } else {
1079 ScratchReg scratch2{derived()};
1080 auto tmp2 = scratch2.alloc_gp();
1081 ASM(MOV64ri, tmp2, scale);
1082 if (idx_tmp != idx) {
1083 ASM(MOV64rr, idx_tmp, idx);
1084 }
1085 ASM(IMUL64rr, idx_tmp, tmp2);
1086 }
1087 }
1088 if (base.valid()) {
1089 if (disp32 || (idx_tmp != dst && base != dst)) {
1090 ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
1091 expr.disp = 0;
1092 } else if (dst == base) {
1093 ASM(ADD64rr, dst, idx_tmp);
1094 } else {
1095 ASM(ADD64rr, dst, base);
1096 }
1097 }
1098 }
1099 } else if (base.valid()) {
1100 if (expr.disp && disp32) {
1101 ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
1102 expr.disp = 0;
1103 } else if (dst != base) {
1104 ASM(MOV64rr, dst, base);
1105 }
1106 }
1107 if (expr.disp) {
1108 ScratchReg scratch2{derived()};
1109 auto tmp2 = scratch2.alloc_gp();
1110 ASM(MOV64ri, tmp2, expr.disp);
1111 ASM(ADD64rr, dst, tmp2);
1112 }
1113 gv.state = std::move(scratch);
1114 return dst;
1115}
1116
1117template <IRAdaptor Adaptor,
1118 typename Derived,
1119 template <typename, typename, typename> typename BaseTy,
1120 typename Config>
1121void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1122 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1123 const auto const_u64 = data[0];
1124 if (bank == Config::GP_BANK) {
1125 assert(size <= 8);
1126 if (const_u64 == 0) {
1127 // note: cannot use XOR here since this might be called in-between
1128 // instructions that rely on the flags being preserved
1129 // ASM(XOR32rr, dst, dst);
1130 ASM(MOV32ri, dst, 0);
1131 return;
1132 }
1133
1134 if (size <= 4) {
1135 ASM(MOV32ri, dst, const_u64);
1136 } else {
1137 ASM(MOV64ri, dst, const_u64);
1138 }
1139 return;
1140 }
1141
1142 assert(bank == Config::FP_BANK);
1143 const auto high_u64 = size <= 8 ? 0 : data[1];
1144 if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
1145 if (has_cpu_feats(CPU_AVX)) {
1146 ASM(VPXOR128rrr, dst, dst, dst);
1147 } else {
1148 ASM(SSE_PXORrr, dst, dst);
1149 }
1150 return;
1151 }
1152 const u64 ones = -u64{1};
1153 if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
1154 if (has_cpu_feats(CPU_AVX)) {
1155 ASM(VPCMPEQB128rrr, dst, dst, dst);
1156 } else {
1157 ASM(SSE_PCMPEQBrr, dst, dst);
1158 }
1159 return;
1160 }
1161
1162 if (size <= 8) {
1163 // We must not evict registers here (might be used within branching code),
1164 // so only use free registers and load from memory otherwise.
1165 AsmReg tmp =
1166 this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
1167 if (tmp.valid()) {
1168 this->register_file.mark_clobbered(tmp);
1169 materialize_constant(data, Config::GP_BANK, size, tmp);
1170 if (size <= 4) {
1171 if (has_cpu_feats(CPU_AVX)) {
1172 ASM(VMOVD_G2Xrr, dst, tmp);
1173 } else {
1174 ASM(SSE_MOVD_G2Xrr, dst, tmp);
1175 }
1176 } else {
1177 if (has_cpu_feats(CPU_AVX)) {
1178 ASM(VMOVQ_G2Xrr, dst, tmp);
1179 } else {
1180 ASM(SSE_MOVQ_G2Xrr, dst, tmp);
1181 }
1182 }
1183 return;
1184 }
1185 }
1186
  // TODO: round up to the next power of two, but at least 4 bytes
1188 // We store constants in 8-byte units.
1189 auto alloc_size = util::align_up(size, 8);
1190 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
1191 // TODO: deduplicate/pool constants?
1192 auto rodata = this->assembler.get_data_section(true, false);
1193 auto sym = this->assembler.sym_def_data(
1194 rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
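  // Load the constant RIP-relative; the displacement emitted here is only a
  // placeholder that the R_X86_64_PC32 relocation added below resolves.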
1195 if (size <= 4) {
1196 if (has_cpu_feats(CPU_AVX)) {
1197 ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1198 } else {
1199 ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1200 }
1201 } else if (size <= 8) {
1202 if (has_cpu_feats(CPU_AVX)) {
1203 ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1204 } else {
1205 ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1206 }
1207 } else if (size <= 16) {
1208 if (has_cpu_feats(CPU_AVX)) {
1209 ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1210 } else {
1211 ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1212 }
1213 } else {
1214 // TODO: implement for AVX/AVX-512.
1215 TPDE_FATAL("unable to materialize constant");
1216 }
1217
1218 this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
1219}
1220
1221template <IRAdaptor Adaptor,
1222 typename Derived,
1223 template <typename, typename, typename> typename BaseTy,
1224 typename Config>
1225AsmReg
1226 CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1227 const RegBank bank, IRValueRef) noexcept {
  assert(bank.id() < Config::NUM_BANKS);
1229 auto reg_mask = this->register_file.bank_regs(bank);
1230 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1231
1232 const auto find_possible_regs = [this,
1233 reg_mask](const u64 preferred_regs) -> u64 {
1234 // try to first get an unused reg, otherwise an unfixed reg
1235 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1236 u64 possible_regs = free_regs & preferred_regs & reg_mask;
1237 if (possible_regs == 0) {
1238 possible_regs = (this->register_file.used & ~this->register_file.fixed) &
1239 preferred_regs & reg_mask;
1240 }
1241 return possible_regs;
1242 };
1243
1244 u64 possible_regs;
1245 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1246 if (derived()->cur_func_may_emit_calls()) {
    // we can only allocate fixed assignments from the callee-saved regs
1248 possible_regs = find_possible_regs(csr);
1249 } else {
1250 // try allocating any non-callee saved register first, except the result
1251 // registers
1252 possible_regs = find_possible_regs(~csr);
1253 if (possible_regs == 0) {
      // otherwise fall back to callee-saved regs
1255 possible_regs = find_possible_regs(csr);
1256 }
1257 }
1258
1259 if (possible_regs == 0) {
1260 return AsmReg::make_invalid();
1261 }
1262
1263 // try to first get an unused reg, otherwise an unfixed reg
1264 if ((possible_regs & ~this->register_file.used) != 0) {
1265 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1266 }
1267
1268 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1269 const auto reg = AsmReg{reg_id};
1270
1271 if (this->register_file.is_fixed(reg)) {
1272 continue;
1273 }
1274
1275 const auto local_idx = this->register_file.reg_local_idx(reg);
1276 const auto part = this->register_file.reg_part(reg);
1277
1278 if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
1279 continue;
1280 }
1281 auto *assignment = this->val_assignment(local_idx);
1282 auto ap = AssignmentPartRef{assignment, part};
1283 if (ap.modified()) {
1284 continue;
1285 }
1286
1287 return reg;
1288 }
1289
1290 return AsmReg::make_invalid();
1291}
1292
1293template <IRAdaptor Adaptor,
1294 typename Derived,
1295 template <typename, typename, typename> typename BaseTy,
1296 typename Config>
1297typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1298 CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1299 Jump jmp) noexcept {
1300 switch (jmp) {
1301 case Jump::ja: return Jump::jbe;
1302 case Jump::jae: return Jump::jb;
1303 case Jump::jb: return Jump::jae;
1304 case Jump::jbe: return Jump::ja;
1305 case Jump::je: return Jump::jne;
1306 case Jump::jg: return Jump::jle;
1307 case Jump::jge: return Jump::jl;
1308 case Jump::jl: return Jump::jge;
1309 case Jump::jle: return Jump::jg;
1310 case Jump::jmp: return Jump::jmp;
1311 case Jump::jne: return Jump::je;
1312 case Jump::jno: return Jump::jo;
1313 case Jump::jo: return Jump::jno;
1314 case Jump::js: return Jump::jns;
1315 case Jump::jns: return Jump::js;
1316 case Jump::jp: return Jump::jnp;
1317 case Jump::jnp: return Jump::jp;
1318 default: TPDE_UNREACHABLE("invalid jump condition");
1319 }
1320}
1321
1322template <IRAdaptor Adaptor,
1323 typename Derived,
1324 template <typename, typename, typename> class BaseTy,
1325 typename Config>
1326typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1327 CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1328 Jump jmp) noexcept {
1329 switch (jmp) {
1330 case Jump::ja: return Jump::jb;
1331 case Jump::jae: return Jump::jbe;
1332 case Jump::jb: return Jump::ja;
1333 case Jump::jbe: return Jump::jae;
1334 case Jump::je: return Jump::je;
1335 case Jump::jg: return Jump::jl;
1336 case Jump::jge: return Jump::jle;
1337 case Jump::jl: return Jump::jg;
1338 case Jump::jle: return Jump::jge;
1339 case Jump::jmp: return Jump::jmp;
1340 case Jump::jne: return Jump::jne;
1341 case Jump::jno: return Jump::jno;
1342 case Jump::jo: return Jump::jo;
1343 case Jump::js: return Jump::js;
1344 case Jump::jns: return Jump::jns;
1345 case Jump::jp: return Jump::jp;
1346 case Jump::jnp: return Jump::jnp;
1347 default: TPDE_UNREACHABLE("invalid jump condition");
1348 }
1349}
1350
1351template <IRAdaptor Adaptor,
1352 typename Derived,
1353 template <typename, typename, typename> typename BaseTy,
1354 typename Config>
1355void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
1356 const Jump jmp,
1357 IRBlockRef target,
1358 const bool needs_split,
1359 const bool last_inst) noexcept {
1360 const auto target_idx = this->analyzer.block_idx(target);
1361 if (!needs_split || jmp == Jump::jmp) {
1362 this->derived()->move_to_phi_nodes(target_idx);
1363
1364 if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
1365 generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
1366 }
1367 } else {
1368 auto tmp_label = this->assembler.label_create();
1369 generate_raw_jump(invert_jump(jmp), tmp_label);
1370
1371 this->derived()->move_to_phi_nodes(target_idx);
1372
1373 generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);
1374
1375 this->label_place(tmp_label);
1376 }
1377}
1378
1379template <IRAdaptor Adaptor,
1380 typename Derived,
1381 template <typename, typename, typename> typename BaseTy,
1382 typename Config>
1383void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1384 Jump jmp, Assembler::Label target_label) noexcept {
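  // If the label has not been placed yet, force a 32-bit displacement
  // (FE_JMPL) and record an unresolved entry so the displacement can be
  // patched once the label is placed; otherwise encode the jump directly
  // against the known offset.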
1385 if (this->assembler.label_is_pending(target_label)) {
1386 this->text_writer.ensure_space(6);
1387 auto *target = this->text_writer.cur_ptr();
1388 switch (jmp) {
1389 case Jump::ja: ASMNCF(JA, FE_JMPL, target); break;
1390 case Jump::jae: ASMNCF(JNC, FE_JMPL, target); break;
1391 case Jump::jb: ASMNCF(JC, FE_JMPL, target); break;
1392 case Jump::jbe: ASMNCF(JBE, FE_JMPL, target); break;
1393 case Jump::je: ASMNCF(JZ, FE_JMPL, target); break;
1394 case Jump::jg: ASMNCF(JG, FE_JMPL, target); break;
1395 case Jump::jge: ASMNCF(JGE, FE_JMPL, target); break;
1396 case Jump::jl: ASMNCF(JL, FE_JMPL, target); break;
1397 case Jump::jle: ASMNCF(JLE, FE_JMPL, target); break;
1398 case Jump::jmp: ASMNCF(JMP, FE_JMPL, target); break;
1399 case Jump::jne: ASMNCF(JNZ, FE_JMPL, target); break;
1400 case Jump::jno: ASMNCF(JNO, FE_JMPL, target); break;
1401 case Jump::jo: ASMNCF(JO, FE_JMPL, target); break;
1402 case Jump::js: ASMNCF(JS, FE_JMPL, target); break;
1403 case Jump::jns: ASMNCF(JNS, FE_JMPL, target); break;
1404 case Jump::jp: ASMNCF(JP, FE_JMPL, target); break;
1405 case Jump::jnp: ASMNCF(JNP, FE_JMPL, target); break;
1406 }
1407
1408 this->assembler.add_unresolved_entry(
1409 target_label,
1410 this->text_writer.get_sec_ref(),
1411 this->text_writer.offset() - 4,
1412 Assembler::UnresolvedEntryKind::JMP_OR_MEM_DISP);
1413 } else {
1414 this->text_writer.ensure_space(6);
1415 auto *target = this->text_writer.begin_ptr() +
1416 this->assembler.label_offset(target_label);
1417 switch (jmp) {
1418 case Jump::ja: ASMNC(JA, target); break;
1419 case Jump::jae: ASMNC(JNC, target); break;
1420 case Jump::jb: ASMNC(JC, target); break;
1421 case Jump::jbe: ASMNC(JBE, target); break;
1422 case Jump::je: ASMNC(JZ, target); break;
1423 case Jump::jg: ASMNC(JG, target); break;
1424 case Jump::jge: ASMNC(JGE, target); break;
1425 case Jump::jl: ASMNC(JL, target); break;
1426 case Jump::jle: ASMNC(JLE, target); break;
1427 case Jump::jmp: ASMNC(JMP, target); break;
1428 case Jump::jne: ASMNC(JNZ, target); break;
1429 case Jump::jno: ASMNC(JNO, target); break;
1430 case Jump::jo: ASMNC(JO, target); break;
1431 case Jump::js: ASMNC(JS, target); break;
1432 case Jump::jns: ASMNC(JNS, target); break;
1433 case Jump::jp: ASMNC(JP, target); break;
1434 case Jump::jnp: ASMNC(JNP, target); break;
1435 }
1436 }
1437}
1438
1439template <IRAdaptor Adaptor,
1440 typename Derived,
1441 template <typename, typename, typename> class BaseTy,
1442 typename Config>
1443void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1444 Jump cc, AsmReg dst) noexcept {
1445 ASM(MOV32ri, dst, 0);
1446 switch (cc) {
1447 case Jump::ja: ASM(SETA8r, dst); break;
1448 case Jump::jae: ASM(SETNC8r, dst); break;
1449 case Jump::jb: ASM(SETC8r, dst); break;
1450 case Jump::jbe: ASM(SETBE8r, dst); break;
1451 case Jump::je: ASM(SETZ8r, dst); break;
1452 case Jump::jg: ASM(SETG8r, dst); break;
1453 case Jump::jge: ASM(SETGE8r, dst); break;
1454 case Jump::jl: ASM(SETL8r, dst); break;
1455 case Jump::jle: ASM(SETLE8r, dst); break;
1456 case Jump::jmp: ASM(MOV32ri, dst, 1); break;
1457 case Jump::jne: ASM(SETNZ8r, dst); break;
1458 case Jump::jno: ASM(SETNO8r, dst); break;
1459 case Jump::jo: ASM(SETO8r, dst); break;
1460 case Jump::js: ASM(SETS8r, dst); break;
1461 case Jump::jns: ASM(SETNS8r, dst); break;
1462 case Jump::jp: ASM(SETP8r, dst); break;
1463 case Jump::jnp: ASM(SETNP8r, dst); break;
1464 }
1465}
1466
1467template <IRAdaptor Adaptor,
1468 typename Derived,
1469 template <typename, typename, typename> class BaseTy,
1470 typename Config>
1471void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1472 Jump cc, AsmReg dst) noexcept {
  // TODO: use sbb dst,dst / adc dst,-1 for the carry flag
1474 generate_raw_set(cc, dst);
1475 ASM(NEG64r, dst);
1476}
1477template <IRAdaptor Adaptor,
1478 typename Derived,
1479 template <typename, typename, typename> class BaseTy,
1480 typename Config>
1481void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
1482 Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
1483 this->text_writer.ensure_space(16);
1484 if (is_64) {
1485 switch (cc) {
1486 case Jump::ja: ASMNC(CMOVA64rr, dst, src); break;
1487 case Jump::jae: ASMNC(CMOVNC64rr, dst, src); break;
1488 case Jump::jb: ASMNC(CMOVC64rr, dst, src); break;
1489 case Jump::jbe: ASMNC(CMOVBE64rr, dst, src); break;
1490 case Jump::je: ASMNC(CMOVZ64rr, dst, src); break;
1491 case Jump::jg: ASMNC(CMOVG64rr, dst, src); break;
1492 case Jump::jge: ASMNC(CMOVGE64rr, dst, src); break;
1493 case Jump::jl: ASMNC(CMOVL64rr, dst, src); break;
1494 case Jump::jle: ASMNC(CMOVLE64rr, dst, src); break;
1495 case Jump::jmp: ASMNC(MOV64rr, dst, src); break;
1496 case Jump::jne: ASMNC(CMOVNZ64rr, dst, src); break;
1497 case Jump::jno: ASMNC(CMOVNO64rr, dst, src); break;
1498 case Jump::jo: ASMNC(CMOVO64rr, dst, src); break;
1499 case Jump::js: ASMNC(CMOVS64rr, dst, src); break;
1500 case Jump::jns: ASMNC(CMOVNS64rr, dst, src); break;
1501 case Jump::jp: ASMNC(CMOVP64rr, dst, src); break;
1502 case Jump::jnp: ASMNC(CMOVNP64rr, dst, src); break;
1503 }
1504 } else {
1505 switch (cc) {
1506 case Jump::ja: ASMNC(CMOVA32rr, dst, src); break;
1507 case Jump::jae: ASMNC(CMOVNC32rr, dst, src); break;
1508 case Jump::jb: ASMNC(CMOVC32rr, dst, src); break;
1509 case Jump::jbe: ASMNC(CMOVBE32rr, dst, src); break;
1510 case Jump::je: ASMNC(CMOVZ32rr, dst, src); break;
1511 case Jump::jg: ASMNC(CMOVG32rr, dst, src); break;
1512 case Jump::jge: ASMNC(CMOVGE32rr, dst, src); break;
1513 case Jump::jl: ASMNC(CMOVL32rr, dst, src); break;
1514 case Jump::jle: ASMNC(CMOVLE32rr, dst, src); break;
1515 case Jump::jmp: ASMNC(MOV32rr, dst, src); break;
1516 case Jump::jne: ASMNC(CMOVNZ32rr, dst, src); break;
1517 case Jump::jno: ASMNC(CMOVNO32rr, dst, src); break;
1518 case Jump::jo: ASMNC(CMOVO32rr, dst, src); break;
1519 case Jump::js: ASMNC(CMOVS32rr, dst, src); break;
1520 case Jump::jns: ASMNC(CMOVNS32rr, dst, src); break;
1521 case Jump::jp: ASMNC(CMOVP32rr, dst, src); break;
1522 case Jump::jnp: ASMNC(CMOVNP32rr, dst, src); break;
1523 }
1524 }
1525}
1526
1527template <IRAdaptor Adaptor,
1528 typename Derived,
1529 template <typename, typename, typename> class BaseTy,
1530 typename Config>
1531void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1532 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1533 assert(from < to && to <= 64);
1534 if (!sign) {
1535 switch (from) {
1536 case 8: ASM(MOVZXr32r8, dst, src); break;
1537 case 16: ASM(MOVZXr32r16, dst, src); break;
1538 case 32: ASM(MOV32rr, dst, src); break;
1539 default:
1540 if (from < 32) {
1541 if (dst != src) {
1542 ASM(MOV32rr, dst, src);
1543 }
1544 ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
1545 } else if (dst != src) {
1546 ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
1547 ASM(AND64rr, dst, src);
1548 } else {
1549 ScratchReg tmp{this};
1550 AsmReg tmp_reg = tmp.alloc_gp();
1551 ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
1552 ASM(AND64rr, dst, tmp_reg);
1553 }
1554 }
1555 } else if (to <= 32) {
1556 switch (from) {
1557 case 8: ASM(MOVSXr32r8, dst, src); break;
1558 case 16: ASM(MOVSXr32r16, dst, src); break;
1559 default:
1560 if (dst != src) {
1561 ASM(MOV32rr, dst, src);
1562 }
1563 ASM(SHL32ri, dst, 32 - from);
1564 ASM(SAR32ri, dst, 32 - from);
1565 }
1566 } else {
1567 switch (from) {
1568 case 8: ASM(MOVSXr64r8, dst, src); break;
1569 case 16: ASM(MOVSXr64r16, dst, src); break;
1570 case 32: ASM(MOVSXr64r32, dst, src); break;
1571 default:
1572 if (dst != src) {
1573 ASM(MOV64rr, dst, src);
1574 }
1575 ASM(SHL64ri, dst, 64 - from);
1576 ASM(SAR64ri, dst, 64 - from);
1577 }
1578 }
1579}
1580
1581template <IRAdaptor Adaptor,
1582 typename Derived,
1583 template <typename, typename, typename> class BaseTy,
1584 typename Config>
1585void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
1586 set_stack_used() noexcept {
1587 if (stack_adjust_off == 0) {
1588 stack_adjust_off = this->compiler.text_writer.offset();
1589 // Always use 32-bit immediate
1590 ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
1591 assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
1592 }
1593}
1594
1595template <IRAdaptor Adaptor,
1596 typename Derived,
1597 template <typename, typename, typename> class BaseTy,
1598 typename Config>
1599void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
1600 ValuePart &vp, CCAssignment &cca) noexcept {
1601 AsmReg ptr = vp.load_to_reg(&this->compiler);
1602 ScratchReg scratch{&this->compiler};
1603 AsmReg tmp = scratch.alloc_gp();
1604
1605 auto size = cca.byval_size;
1606 set_stack_used();
1607 i32 off = 0;
1608 while (size >= 8) {
1609 ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1610 ASMC(&this->compiler,
1611 MOV64mr,
1612 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1613 tmp);
1614 off += 8;
1615 size -= 8;
1616 }
1617 if (size >= 4) {
1618 ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1619 ASMC(&this->compiler,
1620 MOV32mr,
1621 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1622 tmp);
1623 off += 4;
1624 size -= 4;
1625 }
1626 if (size >= 2) {
1627 ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1628 ASMC(&this->compiler,
1629 MOV16mr,
1630 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1631 tmp);
1632 off += 2;
1633 size -= 2;
1634 }
1635 if (size >= 1) {
1636 ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1637 ASMC(&this->compiler,
1638 MOV8mr,
1639 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1640 tmp);
1641 }
1642}
1643
1644template <IRAdaptor Adaptor,
1645 typename Derived,
1646 template <typename, typename, typename> class BaseTy,
1647 typename Config>
1648void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
1649 ValuePart &vp, CCAssignment &cca) noexcept {
1650 set_stack_used();
1651
1652 auto reg = vp.load_to_reg(&this->compiler);
1653 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
1654 switch (cca.size) {
1655 case 1:
1656 ASMC(&this->compiler,
1657 MOV8mr,
1658 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1659 reg);
1660 break;
1661 case 2:
1662 ASMC(&this->compiler,
1663 MOV16mr,
1664 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1665 reg);
1666 break;
1667 case 4:
1668 ASMC(&this->compiler,
1669 MOV32mr,
1670 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1671 reg);
1672 break;
1673 case 8:
1674 ASMC(&this->compiler,
1675 MOV64mr,
1676 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1677 reg);
1678 break;
1679 default: TPDE_UNREACHABLE("invalid GP reg size");
1680 }
1681 } else {
1682 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
1683 switch (cca.size) {
1684 case 4:
1685 ASMC(&this->compiler,
1686 SSE_MOVSSmr,
1687 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1688 reg);
1689 break;
1690 case 8:
1691 ASMC(&this->compiler,
1692 SSE_MOVSDmr,
1693 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1694 reg);
1695 break;
1696 case 16:
1697 ASMC(&this->compiler,
1698 SSE_MOVDQAmr,
1699 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1700 reg);
1701 break;
1702 default: TPDE_UNREACHABLE("invalid GP reg size");
1703 }
1704 }
1705}
1706
1707template <IRAdaptor Adaptor,
1708 typename Derived,
1709 template <typename, typename, typename> class BaseTy,
1710 typename Config>
1711void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
1712 std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept {
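  // For vararg calls, the SysV ABI uses AL as an upper bound on the number
  // of XMM registers holding arguments; evict AX if necessary and derive the
  // bound from the lowest currently free XMM register.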
1713 if (this->assigner.is_vararg()) {
1714 if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
1715 this->compiler.evict_reg(Reg{AsmReg::AX});
1716 }
1717 Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
1718 Config::FP_BANK, 0);
1719 unsigned xmm_cnt = 8;
1720 if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
1721 xmm_cnt = next_xmm.id() - AsmReg::XMM0;
1722 }
1723 ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
1724 }
1725
1726 u32 sub = 0;
1727 if (stack_adjust_off != 0) {
1728 auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
1729 sub = util::align_up(this->assigner.get_stack_size(), 0x10);
1730 memcpy(inst_ptr + 3, &sub, sizeof(u32));
1731 } else {
1732 assert(this->assigner.get_stack_size() == 0);
1733 }
1734
1735 if (auto *sym = std::get_if<typename Assembler::SymRef>(&target)) {
1736 this->compiler.text_writer.ensure_space(16);
1737 ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
1738 this->compiler.reloc_text(
1739 *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
1740 } else {
1741 ValuePart &tvp = std::get<ValuePart>(target);
1742 if (AsmReg reg = tvp.cur_reg_unlocked(); reg.valid()) {
1743 ASMC(&this->compiler, CALLr, reg);
1744 } else if (tvp.has_assignment() && tvp.assignment().stack_valid()) {
1745 auto off = tvp.assignment().frame_off();
1746 ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
1747 } else {
1748 assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
1749 AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
1750 ASMC(&this->compiler, CALLr, reg);
1751 }
1752 tvp.reset(&this->compiler);
1753 }
1754
1755 if (stack_adjust_off != 0) {
1756 ASMC(&this->compiler, ADD64ri, FE_SP, sub);
1757 }
1758}
1759
1760template <IRAdaptor Adaptor,
1761 typename Derived,
1762 template <typename, typename, typename> typename BaseTy,
1763 typename Config>
1764void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
1765 std::variant<Assembler::SymRef, ValuePart> &&target,
1766 std::span<CallArg> arguments,
1767 typename Base::ValueRef *result,
1768 const bool variable_args) {
1769 CCAssignerSysV assigner{variable_args};
1770 CallBuilder cb{*derived(), assigner};
1771 for (auto &arg : arguments) {
1772 cb.add_arg(std::move(arg));
1773 }
1774 cb.call(std::move(target));
1775 if (result) {
1776 cb.add_ret(*result);
1777 }
1778}
1779
1780template <IRAdaptor Adaptor,
1781 typename Derived,
1782 template <typename, typename, typename> typename BaseTy,
1783 typename Config>
1784CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
1785 CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
1786 Assembler::SymRef sym, TLSModel model) noexcept {
1787 switch (model) {
1788 default: // TODO: implement optimized access for non-gd-model
1789 case TLSModel::GlobalDynamic: {
1790 // Generate function call to __tls_get_addr; on x86-64, this takes a single
1791 // parameter in rdi.
1792 auto csr = CCAssignerSysV::Info.callee_saved_regs;
1793 for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
1794 this->evict_reg(Reg{reg});
1795 }
1796 ScratchReg arg{this};
1797 AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);
1798
1799 // Call sequence with extra prefixes for linker relaxation. Code sequence
1800 // taken from "ELF Handling For Thread-Local Storage".
1801 this->text_writer.ensure_space(0x10);
1802 *this->text_writer.cur_ptr()++ = 0x66;
1803 ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
1804 this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
1805 *this->text_writer.cur_ptr()++ = 0x66;
1806 *this->text_writer.cur_ptr()++ = 0x66;
1807 *this->text_writer.cur_ptr()++ = 0x48;
1808 ASMNC(CALL, this->text_writer.cur_ptr());
1809 if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
1810 this->sym_tls_get_addr = this->assembler.sym_add_undef(
1811 "__tls_get_addr", Assembler::SymBinding::GLOBAL);
1812 }
1813 this->reloc_text(this->sym_tls_get_addr,
1814 R_X86_64_PLT32,
1815 this->text_writer.offset() - 4,
1816 -4);
1817 arg.reset();
1818
1819 ScratchReg res{this};
1820 res.alloc_specific(AsmReg::AX);
1821 return res;
1822 }
1823 }
1824}
1825
1826} // namespace tpde::x64