TPDE
CompilerX64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "tpde/AssemblerElf.hpp"
7#include "tpde/AssignmentPartRef.hpp"
8#include "tpde/CompilerBase.hpp"
9#include "tpde/base.hpp"
10#include "tpde/x64/FunctionWriterX64.hpp"
11
12#include <bit>
13
14#ifdef TPDE_ASSERTS
15 #include <fadec.h>
16#endif
17
18// Helper macros for assembling in the compiler
19#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
20 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
21#endif
22
23// Use a helper: the arguments might themselves call ASM, so the current text
24// pointer must only be evaluated after the arguments.
25#define ASM_FULL(compiler, reserve, op, ...) \
26 ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))
27
28#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
29#define ASMC(compiler, op, ...) \
30 ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
31#define ASMF(op, flag, ...) \
32 ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
33#define ASMNCF(op, flag, ...) \
34 ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
35#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
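// Usage sketch (illustrative, not part of the original header): ASM(ADD64rr,
// FE_AX, FE_CX) reserves 16 bytes in the text section and encodes
// `add rax, rcx` at the current write position; ASMNC assumes space was
// already reserved, and ASMF/ASMNCF additionally forward an encoding flag
// (e.g. a condition code) to the fadec encoder.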
36
37namespace tpde::x64 {
38
39struct AsmReg : Reg {
40 enum REG : u8 {
41 AX = 0,
42 CX,
43 DX,
44 BX,
45 SP,
46 BP,
47 SI,
48 DI,
49 R8,
50 R9,
51 R10,
52 R11,
53 R12,
54 R13,
55 R14,
56 R15,
57
58 XMM0 = 32,
59 XMM1,
60 XMM2,
61 XMM3,
62 XMM4,
63 XMM5,
64 XMM6,
65 XMM7,
66 XMM8,
67 XMM9,
68 XMM10,
69 XMM11,
70 XMM12,
71 XMM13,
72 XMM14,
73 XMM15,
74 // TODO(ts): optional support for AVX registers with compiler flag
75 };
76
77 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
78
79 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
80
81 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
82
83 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
84 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
85 }
86
87 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
88 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
89 }
90
91 constexpr operator FeRegGP() const noexcept {
92 assert(reg_id <= R15);
93 return FeRegGP{reg_id};
94 }
95
96 operator FeRegGPLH() const noexcept {
97 assert(reg_id <= R15);
98 return FeRegGP{reg_id};
99 }
100
101 constexpr operator FeRegXMM() const noexcept {
102 assert(reg_id >= XMM0 && reg_id <= XMM15);
103 return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
104 }
105};
106
107constexpr static u64
108 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
109 u64 set = 0;
110 for (const auto reg : regs) {
111 set |= 1ull << reg;
112 }
113 return set;
114}
115
116template <size_t N>
117constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
118 u64 set = 0;
119 for (const auto reg : regs) {
120 set |= 1ull << reg.id();
121 }
122 return set;
123}
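// Worked example (illustrative): with SP == 4 and BP == 5, the mask removed
// from the SysV allocatable set below is (1 << 4) | (1 << 5) == 0x30.
static_assert(create_bitmask({AsmReg::BP, AsmReg::SP}) == 0x30);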
124
125class CCAssignerSysV : public CCAssigner {
126public:
127 static constexpr CCInfo Info{
128 .allocatable_regs =
129 0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
130 .callee_saved_regs = create_bitmask({
131 AsmReg::BX,
132 AsmReg::R12,
133 AsmReg::R13,
134 AsmReg::R14,
135 AsmReg::R15,
136 }),
137 .arg_regs = create_bitmask({
138 AsmReg::DI,
139 AsmReg::SI,
140 AsmReg::DX,
141 AsmReg::CX,
142 AsmReg::R8,
143 AsmReg::R9,
144 AsmReg::XMM0,
145 AsmReg::XMM1,
146 AsmReg::XMM2,
147 AsmReg::XMM3,
148 AsmReg::XMM4,
149 AsmReg::XMM5,
150 AsmReg::XMM6,
151 AsmReg::XMM7,
152 }),
153 };
154
155private:
156 u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
157 // The next N assignments must go to the stack.
158 unsigned must_assign_stack = 0;
159 bool vararg;
160 u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;
161
162public:
163 CCAssignerSysV(bool vararg = false) noexcept
164 : CCAssigner(Info), vararg(vararg) {}
165
166 void reset() noexcept override {
167 gp_cnt = xmm_cnt = stack = 0;
168 must_assign_stack = 0;
169 vararg = false;
170 ret_gp_cnt = ret_xmm_cnt = 0;
171 }
172
173 void assign_arg(CCAssignment &arg) noexcept override {
174 if (arg.byval) {
175 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
176 arg.stack_off = stack;
177 stack += arg.size;
178 return;
179 }
180
181 if (arg.bank == RegBank{0}) {
182 static constexpr std::array<AsmReg, 6> gp_arg_regs{
183 AsmReg::DI,
184 AsmReg::SI,
185 AsmReg::DX,
186 AsmReg::CX,
187 AsmReg::R8,
188 AsmReg::R9,
189 };
190 if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
191 arg.reg = gp_arg_regs[gp_cnt];
192 gp_cnt += 1;
193 } else {
194 // Next N arguments must also be assigned to the stack
195 // Increment by one, the value is immediately decremented below.
196 must_assign_stack = arg.consecutive + 1;
197 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
198 arg.stack_off = stack;
199 stack += 8;
200 }
201 } else {
202 if (!must_assign_stack && xmm_cnt < 8) {
203 arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
204 xmm_cnt += 1;
205 } else {
206 // Next N arguments must also be assigned to the stack
207 // Increment by one, the value is immediately decremented below.
208 must_assign_stack = arg.consecutive + 1;
209 u32 size = util::align_up(arg.size, 8);
210 stack = util::align_up(stack, size);
211 arg.stack_off = stack;
212 stack += size;
213 }
214 }
215
216 if (must_assign_stack > 0) {
217 must_assign_stack -= 1;
218 }
219 }
220
221 u32 get_stack_size() noexcept override { return stack; }
222
223 bool is_vararg() const noexcept override { return vararg; }
224
225 void assign_ret(CCAssignment &arg) noexcept override {
226 assert(!arg.byval && !arg.sret);
227 if (arg.bank == RegBank{0}) {
228 if (ret_gp_cnt + arg.consecutive < 2) {
229 arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
230 ret_gp_cnt += 1;
231 } else {
232 assert(false);
233 }
234 } else {
235 if (ret_xmm_cnt + arg.consecutive < 2) {
236 arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
237 ret_xmm_cnt += 1;
238 } else {
239 assert(false);
240 }
241 }
242 }
243};
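// Illustrative walk-through of the assigner above (assuming plain scalar
// arguments): for a call with seven integer arguments, assign_arg hands out
// DI, SI, DX, CX, R8 and R9 for the first six; the seventh gets stack_off 0
// and get_stack_size() afterwards returns 8. GP and XMM arguments are counted
// independently, so mixing them does not consume slots of the other bank.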
244
245struct PlatformConfig : CompilerConfigDefault {
246 using Assembler = AssemblerElfX64;
247 using AsmReg = tpde::x64::AsmReg;
248 using DefaultCCAssigner = CCAssignerSysV;
249 using FunctionWriter = FunctionWriterX64;
250
251 static constexpr RegBank GP_BANK{0};
252 static constexpr RegBank FP_BANK{1};
253 static constexpr bool FRAME_INDEXING_NEGATIVE = true;
254 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
255 static constexpr u32 NUM_BANKS = 2;
256};
257
258namespace concepts {
259template <typename T, typename Config>
260concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
261 {
262 a.arg_is_int128(std::declval<typename T::IRValueRef>())
263 } -> std::convertible_to<bool>;
264
265 {
266 a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
267 } -> std::convertible_to<bool>;
268};
269} // namespace concepts
270
271template <IRAdaptor Adaptor,
272 typename Derived,
273 template <typename, typename, typename> typename BaseTy =
274 CompilerBase,
275 typename Config = PlatformConfig>
276struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
277 using Base = BaseTy<Adaptor, Derived, Config>;
278
279 using IRValueRef = typename Base::IRValueRef;
280 using IRBlockRef = typename Base::IRBlockRef;
281 using IRFuncRef = typename Base::IRFuncRef;
282
283 using ScratchReg = typename Base::ScratchReg;
284 using ValuePartRef = typename Base::ValuePartRef;
285 using ValuePart = typename Base::ValuePart;
286 using GenericValuePart = typename Base::GenericValuePart;
287
288 using Assembler = typename PlatformConfig::Assembler;
289 using RegisterFile = typename Base::RegisterFile;
290
291 using CallArg = typename Base::CallArg;
292
293 using Base::derived;
294
295
296 // TODO(ts): make this dependent on the number of callee-saved regs of the
297 // current function, or on whether the function contains a call?
298 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
299 6};
300
301 enum CPU_FEATURES : u32 {
302 CPU_BASELINE = 0, // x86-64-v1
303 CPU_CMPXCHG16B = (1 << 0),
304 CPU_POPCNT = (1 << 1),
305 CPU_SSE3 = (1 << 2),
306 CPU_SSSE3 = (1 << 3),
307 CPU_SSE4_1 = (1 << 4),
308 CPU_SSE4_2 = (1 << 5),
309 CPU_AVX = (1 << 6),
310 CPU_AVX2 = (1 << 7),
311 CPU_BMI1 = (1 << 8),
312 CPU_BMI2 = (1 << 9),
313 CPU_F16C = (1 << 10),
314 CPU_FMA = (1 << 11),
315 CPU_LZCNT = (1 << 12),
316 CPU_MOVBE = (1 << 13),
317 CPU_AVX512F = (1 << 14),
318 CPU_AVX512BW = (1 << 15),
319 CPU_AVX512CD = (1 << 16),
320 CPU_AVX512DQ = (1 << 17),
321 CPU_AVX512VL = (1 << 18),
322
323 CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
324 CPU_SSE4_1 | CPU_SSE4_2,
325 CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
326 CPU_FMA | CPU_LZCNT | CPU_MOVBE,
327 CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
328 CPU_AVX512VL,
329 };
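  // The feature levels above are cumulative bitmasks, so a query such as
  // has_cpu_feats(CPU_SSE4_2) holds whenever cpu_feats was initialized with
  // CPU_V2, CPU_V3 or CPU_V4.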
330
331 CPU_FEATURES cpu_feats = CPU_BASELINE;
332
333 // When handling function arguments, we need to prevent argument registers
334 // from being handed out as fixed registers.
335 //
336 // Additionally, for now we prevent AX, DX and CX from being fixed so that we
337 // do not run into issues with instructions that need them as implicit
338 // operands. AX and DX can also never be fixed when exception handling is
339 // used, since they are clobbered there.
340 u64 fixed_assignment_nonallocatable_mask =
341 create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
342 u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
343 func_reg_restore_alloc = 0u;
344 /// Offset to the `sub rsp, XXX` instruction that sets up the frame
345 u32 frame_size_setup_offset = 0u;
346 /// For vararg functions only: number of scalar and xmm registers used.
347 // TODO: this information should be obtained from the CCAssigner.
348 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
349 u32 reg_save_frame_off = 0;
350 u32 var_arg_stack_off = 0;
351 util::SmallVector<u32, 8> func_ret_offs = {};
352
353 /// Symbol for __tls_get_addr.
354 SymRef sym_tls_get_addr;
355
356 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
357 u32 stack_adjust_off = 0;
358
359 void set_stack_used() noexcept;
360
361 public:
362 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
363 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
364
365 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
366 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
367 void call_impl(std::variant<SymRef, ValuePart> &&target) noexcept;
368 void reset_stack() noexcept;
369 };
370
371 // for now, always generate an object
372 explicit CompilerX64(Adaptor *adaptor,
373 const CPU_FEATURES cpu_features = CPU_BASELINE)
374 : Base{adaptor}, cpu_feats(cpu_features) {
375 static_assert(std::is_base_of_v<CompilerX64, Derived>);
376 static_assert(concepts::Compiler<Derived, PlatformConfig>);
377 }
378
379 template <typename... Args>
380 auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
381 struct Helper {
382 CompilerX64 *compiler;
383 decltype(enc_fn) fn;
384 void encode(unsigned reserve, int flags, Args... args) {
385 if (reserve) {
386 compiler->text_writer.ensure_space(reserve);
387 }
388 unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
389 assert(n != 0);
390 compiler->text_writer.cur_ptr() += n;
391 }
392 };
393 return Helper{this, enc_fn};
394 }
395
396 void start_func(u32 func_idx) noexcept;
397
398 void gen_func_prolog_and_args(CCAssigner *) noexcept;
399
400 void finish_func(u32 func_idx) noexcept;
401
402 void reset() noexcept;
403
404 // helpers
405
406 void gen_func_epilog() noexcept;
407
408 void
409 spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;
410
411 void load_from_stack(AsmReg dst,
412 i32 frame_off,
413 u32 size,
414 bool sign_extend = false) noexcept;
415
416 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
417
418 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
419
420 GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
421 assert(ap.stack_valid() && !ap.variable_ref());
422 return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
423 }
424
425 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
426
427 /// Dynamic alloca of a fixed-size region.
428 void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;
429
430 /// Dynamic alloca of a dynamically-sized region (elem_size * count bytes).
431 /// count must be a 64-bit value.
432 void alloca_dynamic(u64 elem_size,
433 ValuePart &&count,
434 u32 align,
435 ValuePart &res) noexcept;
436
437 void materialize_constant(const u64 *data,
438 RegBank bank,
439 u32 size,
440 AsmReg dst) noexcept;
441
442 AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;
443
444 /// Jump conditions.
445 enum class Jump {
446 jo = 0, ///< Jump if overflow (OF=1).
447 jno, ///< Jump if not overflow (OF=0).
448 jb, ///< Jump if below/if carry (CF=1).
449 jae, ///< Jump if above or equal/if not carry (CF=0).
450 je, ///< Jump if equal/if zero (ZF=1).
451 jne, ///< Jump if not equal/if not zero (ZF=0).
452 jbe, ///< Jump if below or equal (CF=1 or ZF=1).
453 ja, ///< Jump if above (CF=0 and ZF=0).
454 js, ///< Jump if sign (SF=1).
455 jns, ///< Jump if not sign (SF=0).
456 jp, ///< Jump if parity even (PF=1).
457 jnp, ///< Jump if parity odd (PF=0).
458 jl, ///< Jump if less (SF!=OF).
459 jge, ///< Jump if greater or equal (SF=OF).
460 jle, ///< Jump if less or equal (ZF=1 or SF!=OF).
461 jg, ///< Jump if greater (ZF=0 and SF=OF).
462 jmp, ///< Unconditional jump.
463 };
464
465 Jump invert_jump(Jump jmp) noexcept;
466 Jump swap_jump(Jump jmp) noexcept;
467
468 FeCond jump_to_cond(Jump jmp) noexcept;
469
470 void generate_branch_to_block(Jump jmp,
471 IRBlockRef target,
472 bool needs_split,
473 bool last_inst) noexcept;
474
475 void generate_raw_jump(Jump jmp, Label target) noexcept;
476
477 /// Set dst to 1 if cc is true, otherwise set it to zero. If zext is false,
478 /// only the lowest 8 bits are set. Flags are not clobbered.
479 void generate_raw_set(Jump cc, AsmReg dst, bool zext = true) noexcept;
480 /// Set all bits of dst to 1 if cc is true, otherwise set it to zero
481 void generate_raw_mask(Jump cc, AsmReg dst) noexcept;
482 /// Move src into dst if cc is true, otherwise do nothing
483 void generate_raw_cmov(Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept;
484
485 void generate_raw_intext(
486 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
487
488 /// Generate a function call
489 ///
490 /// This will move the arguments into the correct registers according to the
491 /// calling convention, clear non-callee-saved registers from the register
492 /// file (make sure you do not have any fixed assignments left over), and
493 /// fill the result registers (the u8 in the ScratchReg pair indicates the
494 /// register bank).
495 ///
496 /// The target can be a symbol (call to the PLT with a relocation) or an
497 /// indirect call to a ValuePart. Result is an optional reference.
498 void generate_call(std::variant<SymRef, ValuePart> &&target,
499 std::span<CallArg> arguments,
500 typename Base::ValueRef *result,
501 bool variable_args = false);
502
503 /// Generate code sequence to load address of sym into a register. This will
504 /// generate a function call for dynamic TLS access models.
505 ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;
506
507 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
508 return ((cpu_feats & feats) == feats);
509 }
510};
511
512template <IRAdaptor Adaptor,
513 typename Derived,
514 template <typename, typename, typename> class BaseTy,
515 typename Config>
516void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
517 const u32 /*func_idx*/) noexcept {
518 this->text_writer.align(16);
519 this->assembler.except_begin_func();
520}
521
522template <IRAdaptor Adaptor,
523 typename Derived,
524 template <typename, typename, typename> typename BaseTy,
525 typename Config>
526void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
527 CCAssigner *cc_assigner) noexcept {
528 // prologue:
529 // push rbp
530 // mov rbp, rsp
531 // optionally create vararg save-area
532 // reserve space for callee-saved regs
533 // = 1 byte for each of the lower 8 regs and 2
534 // bytes for the higher 8 regs
535 // sub rsp, #<frame_size>+<largest_call_frame_usage>
536
537 // TODO(ts): technically we only need rbp if there
538 // is a dynamic alloca but then we need to make the
539 // frame indexing dynamic in CompilerBase and the
540 // unwind info needs to take the dynamic sub rsp for
541 // calls into account
542
543 func_ret_offs.clear();
544 func_start_off = this->text_writer.offset();
545 scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;
546
547 const CCInfo &cc_info = cc_assigner->get_ccinfo();
548
549 ASM(PUSHr, FE_BP);
550 ASM(MOV64rr, FE_BP, FE_SP);
551
552 func_reg_save_off = this->text_writer.offset();
553
554 auto csr = cc_info.callee_saved_regs;
555 assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
556 "non-gp callee-saved registers not implemented");
557
558 u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
559 u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
560 // R8 and higher need a REX prefix.
561 u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
562 this->stack.frame_size = 8 * (csr_logp + csr_higp);
563
564 this->text_writer.ensure_space(reg_save_size);
565 this->text_writer.cur_ptr() += reg_save_size;
566 func_reg_save_alloc = reg_save_size;
567 // pop uses the same number of bytes as push
568 func_reg_restore_alloc = reg_save_size;
569
570 // TODO(ts): support larger stack alignments?
571
572 // placeholder for later
573 frame_size_setup_offset = this->text_writer.offset();
574 ASM(SUB64ri, FE_SP, 0x7FFF'FFFF);
575#ifdef TPDE_ASSERTS
576 assert((this->text_writer.offset() - frame_size_setup_offset) == 7);
577#endif
578
579 if (this->adaptor->cur_is_vararg()) {
580 this->stack.frame_size += 6 * 8 + 8 * 16;
581 reg_save_frame_off = this->stack.frame_size;
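    // Register save area layout (at rbp - reg_save_frame_off): 6 * 8 bytes
    // for the GP argument registers rdi..r9, followed by 8 * 16 bytes for
    // xmm0..xmm7, matching the System V va_list register save area. AL holds
    // an upper bound on the number of vector argument registers used by the
    // caller, so the xmm stores are skipped below when it is zero.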
582 auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
583 ASM(MOV64mr, mem, FE_DI);
584 mem.off += 8;
585 ASM(MOV64mr, mem, FE_SI);
586 mem.off += 8;
587 ASM(MOV64mr, mem, FE_DX);
588 mem.off += 8;
589 ASM(MOV64mr, mem, FE_CX);
590 mem.off += 8;
591 ASM(MOV64mr, mem, FE_R8);
592 mem.off += 8;
593 ASM(MOV64mr, mem, FE_R9);
594 auto skip_fp = this->text_writer.label_create();
595 ASM(TEST8rr, FE_AX, FE_AX);
596 generate_raw_jump(Jump::je, skip_fp);
597 mem.off += 8;
598 ASM(SSE_MOVDQUmr, mem, FE_XMM0);
599 mem.off += 16;
600 ASM(SSE_MOVDQUmr, mem, FE_XMM1);
601 mem.off += 16;
602 ASM(SSE_MOVDQUmr, mem, FE_XMM2);
603 mem.off += 16;
604 ASM(SSE_MOVDQUmr, mem, FE_XMM3);
605 mem.off += 16;
606 ASM(SSE_MOVDQUmr, mem, FE_XMM4);
607 mem.off += 16;
608 ASM(SSE_MOVDQUmr, mem, FE_XMM5);
609 mem.off += 16;
610 ASM(SSE_MOVDQUmr, mem, FE_XMM6);
611 mem.off += 16;
612 ASM(SSE_MOVDQUmr, mem, FE_XMM7);
613 this->label_place(skip_fp);
614 }
615
616 // Temporarily prevent argument registers from being assigned.
617 assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
618 "argument registers must also be allocatable");
619 this->register_file.allocatable &= ~cc_info.arg_regs;
620
621 u32 arg_idx = 0;
622 for (const IRValueRef arg : this->adaptor->cur_args()) {
623 derived()->handle_func_arg(
624 arg_idx,
625 arg,
626 [&](ValuePart &&vp, CCAssignment cca) -> std::optional<i32> {
627 if (!cca.byval) {
628 cca.bank = vp.bank();
629 cca.size = vp.part_size();
630 }
631
632 cc_assigner->assign_arg(cca);
633
634 if (cca.reg.valid()) [[likely]] {
635 vp.set_value_reg(this, cca.reg);
636 // Mark register as allocatable as soon as it is assigned. If the
637 // argument is unused, the register will be freed immediately and
638 // can be used for later stack arguments.
639 this->register_file.allocatable |= u64{1} << cca.reg.id();
640 return {};
641 }
642
643 if (vp.is_owned()) {
644 // no need to handle unused arguments
645 return {};
646 }
647
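          // Stack arguments live above the saved rbp and the return address,
          // hence the 0x10 offset from the frame pointer.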
648 if (cca.byval) {
649 // Return byval frame_off.
650 return 0x10 + cca.stack_off;
651 } else {
652 // TODO(ts): maybe allow negative frame offsets for value
653 // assignments so we can simply reference this?
654 // but this probably doesn't work with multi-part values
655 // since the offsets are different
656 AsmReg dst = vp.alloc_reg(this);
657 this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);
658 }
659 return {};
660 });
661
662 arg_idx += 1;
663 }
664
665 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
666 // TODO: get this from CCAssigner?
667 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
668 u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
669 u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
670 this->scalar_arg_count = std::popcount(gp_regs);
671 this->vec_arg_count = std::popcount(xmm_regs);
672 this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
673 }
674
675 this->register_file.allocatable |= cc_info.arg_regs;
676}
677
678template <IRAdaptor Adaptor,
679 typename Derived,
680 template <typename, typename, typename> typename BaseTy,
681 typename Config>
682void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
683 u32 func_idx) noexcept {
684 // NB: code alignment factor 1, data alignment factor -8.
685 auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
686 // push rbp
687 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
688 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
689 this->assembler.eh_write_inst(
690 dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
691 // mov rbp, rsp
692 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
693 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
694 dwarf::x64::DW_reg_rbp);
695
696 // Patched below
697 auto fde_prologue_adv_off = this->assembler.eh_writer.size();
698 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
699
700 auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
701 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
702 u64 saved_regs = this->register_file.clobbered & csr;
703 u32 num_saved_regs = 0u;
704 for (auto reg : util::BitSetIterator{saved_regs}) {
705 assert(reg <= AsmReg::R15);
706 write_ptr +=
707 fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
708 ++num_saved_regs;
709
710 // DWARF register ordering is subtly different from the encoding:
711 // x86 is: ax, cx, dx, bx, sp, bp, si, di, r8, ...
712 // DWARF is: ax, dx, cx, bx, si, di, bp, sp, r8, ...
713 static const u8 gpreg_to_dwarf[] = {
714 dwarf::x64::DW_reg_rax,
715 dwarf::x64::DW_reg_rcx,
716 dwarf::x64::DW_reg_rdx,
717 dwarf::x64::DW_reg_rbx,
718 dwarf::x64::DW_reg_rsp,
719 dwarf::x64::DW_reg_rbp,
720 dwarf::x64::DW_reg_rsi,
721 dwarf::x64::DW_reg_rdi,
722 dwarf::x64::DW_reg_r8,
723 dwarf::x64::DW_reg_r9,
724 dwarf::x64::DW_reg_r10,
725 dwarf::x64::DW_reg_r11,
726 dwarf::x64::DW_reg_r12,
727 dwarf::x64::DW_reg_r13,
728 dwarf::x64::DW_reg_r14,
729 dwarf::x64::DW_reg_r15,
730 };
731 u8 dwarf_reg = gpreg_to_dwarf[reg];
732 auto cfa_off = num_saved_regs + 2;
733 this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
734 }
735
736 u32 prologue_size =
737 write_ptr - (this->text_writer.begin_ptr() + func_start_off);
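  // DW_CFA_advance_loc encodes its delta in the low 6 bits of the opcode, so
  // prologue_size - 4 (the part after the initial push/mov) must stay below
  // 64; hence the 0x44 bound.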
738 assert(prologue_size < 0x44);
739 this->assembler.eh_writer.data()[fde_prologue_adv_off] =
740 dwarf::DW_CFA_advance_loc | (prologue_size - 4);
741
742 // The frame_size contains the reserved frame size so we need to subtract
743 // the stack space we used for the saved registers
744 const auto final_frame_size =
745 util::align_up(this->stack.frame_size, 16) - num_saved_regs * 8;
746 *reinterpret_cast<u32 *>(this->text_writer.begin_ptr() +
747 frame_size_setup_offset + 3) = final_frame_size;
748#ifdef TPDE_ASSERTS
749 FdInstr instr = {};
750 assert(fd_decode(this->text_writer.begin_ptr() + frame_size_setup_offset,
751 7,
752 64,
753 0,
754 &instr) == 7);
755 assert(FD_TYPE(&instr) == FDI_SUB);
756 assert(FD_OP_TYPE(&instr, 0) == FD_OT_REG);
757 assert(FD_OP_TYPE(&instr, 1) == FD_OT_IMM);
758 assert(FD_OP_SIZE(&instr, 0) == 8);
759 assert(FD_OP_SIZE(&instr, 1) == 8);
760 assert(FD_OP_IMM(&instr, 1) == final_frame_size);
761#endif
762
763 // nop out the rest
764 const auto reg_save_end =
765 this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
766 assert(reg_save_end >= write_ptr);
767 const u32 nop_len = reg_save_end - write_ptr;
768 if (nop_len) {
769 fe64_NOP(write_ptr, nop_len);
770 }
771
772 auto func_sym = this->func_syms[func_idx];
773 auto func_sec = this->text_writer.get_sec_ref();
774 if (func_ret_offs.empty()) {
775 // TODO(ts): honor cur_needs_unwind_info
776 auto func_size = this->text_writer.offset() - func_start_off;
777 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
778 this->assembler.eh_end_fde(fde_off, func_sym);
779 this->assembler.except_encode_func(func_sym,
780 this->text_writer.label_offsets.data());
781 return;
782 }
783
784 auto *text_data = this->text_writer.begin_ptr();
785 u32 first_ret_off = func_ret_offs[0];
786 u32 ret_size = 0;
787 u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc; // add + pop + ret
788 u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;
789 {
790 write_ptr = text_data + first_ret_off;
791 const auto ret_start = write_ptr;
792 if (this->adaptor->cur_has_dynamic_alloca()) {
793 if (num_saved_regs == 0) {
794 write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
795 } else {
796 write_ptr +=
797 fe64_LEA64rm(write_ptr,
798 0,
799 FE_SP,
800 FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
801 }
802 } else {
803 write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, final_frame_size);
804 }
805 for (auto reg : util::BitSetIterator<true>{saved_regs}) {
806 assert(reg <= AsmReg::R15);
807 write_ptr +=
808 fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
809 }
810 write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
811 write_ptr += fe64_RET(write_ptr, 0);
812 ret_size = write_ptr - ret_start;
813 assert(ret_size <= epilogue_size && "function epilogue too long");
814
815 // write NOP for better disassembly
816 if (epilogue_size > ret_size) {
817 fe64_NOP(write_ptr, epilogue_size - ret_size);
818 if (first_ret_off == func_end_ret_off) {
819 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
820 }
821 }
822 }
823
824 for (u32 i = 1; i < func_ret_offs.size(); ++i) {
825 std::memcpy(
826 text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
827 if (func_ret_offs[i] == func_end_ret_off) {
828 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
829 }
830 }
831
832 // Do sym_def at the very end; we may shorten the function above, so only at
833 // this point do we know the actual size of the function.
834 // TODO(ts): honor cur_needs_unwind_info
835 auto func_size = this->text_writer.offset() - func_start_off;
836 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
837 this->assembler.eh_end_fde(fde_off, func_sym);
838 this->assembler.except_encode_func(func_sym,
839 this->text_writer.label_offsets.data());
840}
841
842template <IRAdaptor Adaptor,
843 typename Derived,
844 template <typename, typename, typename> typename BaseTy,
845 typename Config>
846void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
847 func_ret_offs.clear();
848 sym_tls_get_addr = {};
849 Base::reset();
850}
851
852template <IRAdaptor Adaptor,
853 typename Derived,
854 template <typename, typename, typename> typename BaseTy,
855 typename Config>
856void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
857 // epilogue:
858 // if !func_has_dynamic_alloca:
859 // add rsp, #<frame_size>+<largest_call_frame_usage>
860 // else:
861 // lea rsp, [rbp - <size_of_reg_save_area>]
862 // for each saved reg:
863 // pop <reg>
864 // pop rbp
865 // ret
866 //
867 // however, since we will later patch this, we only
868 // reserve the space for now
869
870 func_ret_offs.push_back(this->text_writer.offset());
871
872 // add rsp, imm32
873 // and
874 // lea rsp, [rbp - imm32]
875 // both take 7 bytes to encode
876 u32 epilogue_size =
877 7 + 1 + 1 +
878 func_reg_restore_alloc; // add/lea + pop + ret + size of reg restore
879
880 this->text_writer.ensure_space(epilogue_size);
881 this->text_writer.cur_ptr() += epilogue_size;
882}
883
884template <IRAdaptor Adaptor,
885 typename Derived,
886 template <typename, typename, typename> typename BaseTy,
887 typename Config>
888void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
889 const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
890 this->text_writer.ensure_space(16);
891 assert(frame_off < 0);
892 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
893 if (reg.id() <= AsmReg::R15) {
894 switch (size) {
895 case 1: ASMNC(MOV8mr, mem, reg); break;
896 case 2: ASMNC(MOV16mr, mem, reg); break;
897 case 4: ASMNC(MOV32mr, mem, reg); break;
898 case 8: ASMNC(MOV64mr, mem, reg); break;
899 default: TPDE_UNREACHABLE("invalid spill size");
900 }
901 return;
902 }
903
904 switch (size) {
905 case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
906 case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
907 case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
908 default: TPDE_UNREACHABLE("invalid spill size");
909 }
910}
911
912template <IRAdaptor Adaptor,
913 typename Derived,
914 template <typename, typename, typename> typename BaseTy,
915 typename Config>
916void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
917 const AsmReg dst,
918 const i32 frame_off,
919 const u32 size,
920 const bool sign_extend) noexcept {
921 this->text_writer.ensure_space(16);
922 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
923
924 if (dst.id() <= AsmReg::R15) {
925 if (!sign_extend) {
926 switch (size) {
927 case 1: ASMNC(MOVZXr32m8, dst, mem); break;
928 case 2: ASMNC(MOVZXr32m16, dst, mem); break;
929 case 4: ASMNC(MOV32rm, dst, mem); break;
930 case 8: ASMNC(MOV64rm, dst, mem); break;
931 default: TPDE_UNREACHABLE("invalid spill size");
932 }
933 } else {
934 switch (size) {
935 case 1: ASMNC(MOVSXr64m8, dst, mem); break;
936 case 2: ASMNC(MOVSXr64m16, dst, mem); break;
937 case 4: ASMNC(MOVSXr64m32, dst, mem); break;
938 case 8: ASMNC(MOV64rm, dst, mem); break;
939 default: TPDE_UNREACHABLE("invalid spill size");
940 }
941 }
942 return;
943 }
944
945 assert(!sign_extend);
946
947 switch (size) {
948 case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
949 case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
950 case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
951 default: TPDE_UNREACHABLE("invalid spill size");
952 }
953}
954
955template <IRAdaptor Adaptor,
956 typename Derived,
957 template <typename, typename, typename> typename BaseTy,
958 typename Config>
959void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
960 const AsmReg dst, const AssignmentPartRef ap) noexcept {
961 ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
962}
963
964template <IRAdaptor Adaptor,
965 typename Derived,
966 template <typename, typename, typename> typename BaseTy,
967 typename Config>
968void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
969 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
970 this->text_writer.ensure_space(16);
971 assert(dst.valid());
972 assert(src.valid());
973 if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
974 if (size > 4) {
975 ASMNC(MOV64rr, dst, src);
976 } else {
977 ASMNC(MOV32rr, dst, src);
978 }
979 } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
980 if (size <= 16) {
981 if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
982 assert(has_cpu_feats(CPU_AVX512F));
983 ASMNC(VMOVAPD128rr, dst, src);
984 } else {
985 ASMNC(SSE_MOVAPDrr, dst, src);
986 }
987 } else if (size <= 32) {
988 assert(has_cpu_feats(CPU_AVX));
989 assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
990 has_cpu_feats(CPU_AVX512F));
991 ASMNC(VMOVAPD256rr, dst, src);
992 } else {
993 assert(size <= 64);
994 assert(has_cpu_feats(CPU_AVX512F));
995 ASMNC(VMOVAPD512rr, dst, src);
996 }
997 } else if (dst.id() <= AsmReg::R15) {
998 // gp<-xmm
999 assert(src.id() >= AsmReg::XMM0);
1000 assert(size <= 8);
1001 if (src.id() > AsmReg::XMM15) {
1002 assert(has_cpu_feats(CPU_AVX512F));
1003 if (size <= 4) {
1004 ASMNC(VMOVD_X2Grr, dst, src);
1005 } else {
1006 ASMNC(VMOVQ_X2Grr, dst, src);
1007 }
1008 } else {
1009 if (size <= 4) {
1010 ASMNC(SSE_MOVD_X2Grr, dst, src);
1011 } else {
1012 ASMNC(SSE_MOVQ_X2Grr, dst, src);
1013 }
1014 }
1015 } else {
1016 // xmm<-gp
1017 assert(src.id() <= AsmReg::R15);
1018 assert(dst.id() >= AsmReg::XMM0);
1019 assert(size <= 8);
1020 if (dst.id() > AsmReg::XMM15) {
1021 assert(has_cpu_feats(CPU_AVX512F));
1022 if (size <= 4) {
1023 ASMNC(VMOVD_G2Xrr, dst, src);
1024 } else {
1025 ASMNC(VMOVQ_G2Xrr, dst, src);
1026 }
1027 } else {
1028 if (size <= 4) {
1029 ASMNC(SSE_MOVD_G2Xrr, dst, src);
1030 } else {
1031 ASMNC(SSE_MOVQ_G2Xrr, dst, src);
1032 }
1033 }
1034 }
1035}
1036
1037template <IRAdaptor Adaptor,
1038 typename Derived,
1039 template <typename, typename, typename> typename BaseTy,
1040 typename Config>
1041AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1042 GenericValuePart &gv) noexcept {
1043 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1044
1045 ScratchReg scratch{derived()};
1046 bool disp32 = i32(expr.disp) == expr.disp;
1047 AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
1048 AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
1049 if (std::holds_alternative<ScratchReg>(expr.base)) {
1050 scratch = std::move(std::get<ScratchReg>(expr.base));
1051 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1052 scratch = std::move(std::get<ScratchReg>(expr.index));
1053 } else {
1054 (void)scratch.alloc_gp();
1055 }
1056 auto dst = scratch.cur_reg();
1057 if (idx.valid()) {
1058 if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
1059 u8 sc = expr.scale;
1060 if (base.valid() && disp32) {
1061 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
1062 expr.disp = 0;
1063 } else if (base.valid()) {
1064 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
1065 } else if (disp32) {
1066 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
1067 } else {
1068 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
1069 }
1070 } else {
1071 u64 scale = expr.scale;
1072 if (base == idx) {
1073 base = AsmReg::make_invalid();
1074 scale += 1;
1075 }
1076
1077 ScratchReg idx_scratch{derived()};
1078 // We need a register to compute the scaled index.
1079 AsmReg idx_tmp = dst;
1080 if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
1081 // We can't use dst, it'd clobber base, so use the other
1082 // register we currently own.
1083 idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
1084 } else if (dst == base) {
1085 idx_tmp = idx_scratch.alloc_gp();
1086 }
1087
1088 if ((scale & (scale - 1)) == 0) {
1089 if (idx_tmp != idx) {
1090 ASM(MOV64rr, idx_tmp, idx);
1091 }
1092 ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
1093 } else {
1094 if (i32(scale) == i64(scale)) {
1095 ASM(IMUL64rri, idx_tmp, idx, scale);
1096 } else {
1097 ScratchReg scratch2{derived()};
1098 auto tmp2 = scratch2.alloc_gp();
1099 ASM(MOV64ri, tmp2, scale);
1100 if (idx_tmp != idx) {
1101 ASM(MOV64rr, idx_tmp, idx);
1102 }
1103 ASM(IMUL64rr, idx_tmp, tmp2);
1104 }
1105 }
1106 if (base.valid()) {
1107 if (disp32 || (idx_tmp != dst && base != dst)) {
1108 ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
1109 expr.disp = 0;
1110 } else if (dst == base) {
1111 ASM(ADD64rr, dst, idx_tmp);
1112 } else {
1113 ASM(ADD64rr, dst, base);
1114 }
1115 }
1116 }
1117 } else if (base.valid()) {
1118 if (expr.disp && disp32) {
1119 ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
1120 expr.disp = 0;
1121 } else if (dst != base) {
1122 ASM(MOV64rr, dst, base);
1123 }
1124 }
1125 if (expr.disp) {
1126 ScratchReg scratch2{derived()};
1127 auto tmp2 = scratch2.alloc_gp();
1128 ASM(MOV64ri, tmp2, expr.disp);
1129 ASM(ADD64rr, dst, tmp2);
1130 }
1131 gv.state = std::move(scratch);
1132 return dst;
1133}
1134
1135template <IRAdaptor Adaptor,
1136 typename Derived,
1137 template <typename, typename, typename> typename BaseTy,
1138 typename Config>
1139void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
1140 u64 size, u32 align, ValuePart &res) noexcept {
1141 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1142 size = tpde::util::align_up(size, 16);
1143 if (size > 0) {
1144 assert(size < 0x8000'0000);
1145 ASM(SUB64ri, FE_SP, size);
1146 }
1147 if (align > 16) {
1148 assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
1149 ASM(AND64ri, FE_SP, ~(align - 1));
1150 }
1151 ASM(MOV64rr, res.alloc_reg(this), FE_SP);
1152}
1153
1154template <IRAdaptor Adaptor,
1155 typename Derived,
1156 template <typename, typename, typename> typename BaseTy,
1157 typename Config>
1158void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
1159 u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
1160 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1161 AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
1162 AsmReg res_reg = res.alloc_try_reuse(this, count);
1163
1164 if (elem_size == 0) {
1165 ASM(XOR32rr, res_reg, res_reg);
1166 } else if ((elem_size & (elem_size - 1)) == 0) {
1167 // elem_size is power of two
1168 const auto shift = util::cnt_tz(elem_size);
1169 if (shift > 0 && shift < 4) {
1170 ASM(LEA64rm, res_reg, FE_MEM(FE_NOREG, u8(1 << shift), size_reg, 0));
1171 } else {
1172 if (size_reg != res_reg) {
1173 ASM(MOV64rr, res_reg, size_reg);
1174 }
1175 if (elem_size != 1) {
1176 ASM(SHL64ri, res_reg, shift);
1177 }
1178 }
1179 } else {
1180 if (elem_size <= 0x7FFF'FFFF) [[likely]] {
1181 ASM(IMUL64rri, res_reg, size_reg, elem_size);
1182 } else {
1183 ScratchReg scratch{this};
1184 auto tmp = scratch.alloc_gp();
1185 ASM(MOV64ri, tmp, elem_size);
1186 if (size_reg != res_reg) {
1187 ASM(MOV64rr, res_reg, size_reg);
1188 }
1189 ASM(IMUL64rr, res_reg, tmp);
1190 }
1191 }
1192
1193 ASM(SUB64rr, FE_SP, res_reg);
1194
1195 align = align > 16 ? align : 16;
1196 if (elem_size & (align - 1)) {
1197 assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
1198 ASM(AND64ri, FE_SP, ~(align - 1));
1199 }
1200
1201 ASM(MOV64rr, res_reg, FE_SP);
1202}
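// Illustrative examples for the size computation above: elem_size == 8 takes
// the power-of-two path and emits `lea res, [count*8]`, while elem_size == 24
// falls back to `imul res, count, 24` before the stack pointer is adjusted.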
1203
1204template <IRAdaptor Adaptor,
1205 typename Derived,
1206 template <typename, typename, typename> typename BaseTy,
1207 typename Config>
1208void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1209 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1210 const auto const_u64 = data[0];
1211 if (bank == Config::GP_BANK) {
1212 assert(size <= 8);
1213 if (const_u64 == 0) {
1214 // note: we cannot use XOR here since this might be called in between
1215 // instructions that rely on the flags being preserved
1216 // ASM(XOR32rr, dst, dst);
1217 ASM(MOV32ri, dst, 0);
1218 return;
1219 }
1220
1221 if (size <= 4 || u32(const_u64) == const_u64) {
1222 ASM(MOV32ri, dst, const_u64);
1223 } else {
1224 ASM(MOV64ri, dst, const_u64);
1225 }
1226 return;
1227 }
1228
1229 assert(bank == Config::FP_BANK);
1230 const auto high_u64 = size <= 8 ? 0 : data[1];
1231 if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
1232 if (has_cpu_feats(CPU_AVX)) {
1233 ASM(VPXOR128rrr, dst, dst, dst);
1234 } else {
1235 ASM(SSE_PXORrr, dst, dst);
1236 }
1237 return;
1238 }
1239 const u64 ones = -u64{1};
1240 if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
1241 if (has_cpu_feats(CPU_AVX)) {
1242 ASM(VPCMPEQB128rrr, dst, dst, dst);
1243 } else {
1244 ASM(SSE_PCMPEQBrr, dst, dst);
1245 }
1246 return;
1247 }
1248
1249 if (size <= 8) {
1250 // We must not evict registers here (might be used within branching code),
1251 // so only use free registers and load from memory otherwise.
1252 AsmReg tmp =
1253 this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
1254 if (tmp.valid()) {
1255 this->register_file.mark_clobbered(tmp);
1256 materialize_constant(data, Config::GP_BANK, size, tmp);
1257 if (size <= 4) {
1258 if (has_cpu_feats(CPU_AVX)) {
1259 ASM(VMOVD_G2Xrr, dst, tmp);
1260 } else {
1261 ASM(SSE_MOVD_G2Xrr, dst, tmp);
1262 }
1263 } else {
1264 if (has_cpu_feats(CPU_AVX)) {
1265 ASM(VMOVQ_G2Xrr, dst, tmp);
1266 } else {
1267 ASM(SSE_MOVQ_G2Xrr, dst, tmp);
1268 }
1269 }
1270 return;
1271 }
1272 }
1273
1274 // TODO: round up to the next power of two, but to at least 4 bytes
1275 // We store constants in 8-byte units.
1276 auto alloc_size = util::align_up(size, 8);
1277 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
1278 // TODO: deduplicate/pool constants?
1279 auto rodata = this->assembler.get_data_section(true, false);
1280 auto sym = this->assembler.sym_def_data(
1281 rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
1282 if (size <= 4) {
1283 if (has_cpu_feats(CPU_AVX)) {
1284 ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1285 } else {
1286 ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1287 }
1288 } else if (size <= 8) {
1289 if (has_cpu_feats(CPU_AVX)) {
1290 ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1291 } else {
1292 ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1293 }
1294 } else if (size <= 16) {
1295 if (has_cpu_feats(CPU_AVX)) {
1296 ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1297 } else {
1298 ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1299 }
1300 } else {
1301 // TODO: implement for AVX/AVX-512.
1302 TPDE_FATAL("unable to materialize constant");
1303 }
1304
1305 this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
1306}
1307
1308template <IRAdaptor Adaptor,
1309 typename Derived,
1310 template <typename, typename, typename> typename BaseTy,
1311 typename Config>
1312AsmReg
1313 CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1314 AssignmentPartRef ap, IRValueRef) noexcept {
1315 RegBank bank = ap.bank();
1316 assert(bank.id() <= Config::NUM_BANKS);
1317 auto reg_mask = this->register_file.bank_regs(bank);
1318 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1319
1320 const auto find_possible_regs = [this,
1321 reg_mask](const u64 preferred_regs) -> u64 {
1322 // try to first get an unused reg, otherwise an unfixed reg
1323 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1324 return free_regs & preferred_regs & reg_mask;
1325 };
1326
1327 u64 possible_regs;
1328 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1329 if (derived()->cur_func_may_emit_calls()) {
1330 // we can only allocate fixed assignments from the callee-saved regs
1331 possible_regs = find_possible_regs(csr);
1332 } else {
1333 // try allocating any non-callee saved register first, except the result
1334 // registers
1335 possible_regs = find_possible_regs(~csr);
1336 if (possible_regs == 0) {
1337 // otherwise fallback to callee-saved regs
1338 possible_regs = find_possible_regs(csr);
1339 }
1340 }
1341
1342 if (possible_regs == 0) {
1343 return AsmReg::make_invalid();
1344 }
1345
1346 // try to first get an unused reg, otherwise an unfixed reg
1347 if ((possible_regs & ~this->register_file.used) != 0) {
1348 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1349 }
1350
1351 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1352 const auto reg = AsmReg{reg_id};
1353
1354 if (this->register_file.is_fixed(reg)) {
1355 continue;
1356 }
1357
1358 const auto local_idx = this->register_file.reg_local_idx(reg);
1359 const auto part = this->register_file.reg_part(reg);
1360
1361 if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
1362 continue;
1363 }
1364 auto *assignment = this->val_assignment(local_idx);
1365 auto ap = AssignmentPartRef{assignment, part};
1366 if (ap.modified()) {
1367 continue;
1368 }
1369
1370 return reg;
1371 }
1372
1373 return AsmReg::make_invalid();
1374}
1375
1376template <IRAdaptor Adaptor,
1377 typename Derived,
1378 template <typename, typename, typename> typename BaseTy,
1379 typename Config>
1380typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1381 CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1382 Jump jmp) noexcept {
1383 switch (jmp) {
1384 case Jump::ja: return Jump::jbe;
1385 case Jump::jae: return Jump::jb;
1386 case Jump::jb: return Jump::jae;
1387 case Jump::jbe: return Jump::ja;
1388 case Jump::je: return Jump::jne;
1389 case Jump::jg: return Jump::jle;
1390 case Jump::jge: return Jump::jl;
1391 case Jump::jl: return Jump::jge;
1392 case Jump::jle: return Jump::jg;
1393 case Jump::jne: return Jump::je;
1394 case Jump::jno: return Jump::jo;
1395 case Jump::jo: return Jump::jno;
1396 case Jump::js: return Jump::jns;
1397 case Jump::jns: return Jump::js;
1398 case Jump::jp: return Jump::jnp;
1399 case Jump::jnp: return Jump::jp;
1400 default: TPDE_UNREACHABLE("invalid jump kind for invert_jump");
1401 }
1402}
1403
1404template <IRAdaptor Adaptor,
1405 typename Derived,
1406 template <typename, typename, typename> class BaseTy,
1407 typename Config>
1408typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1409 CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1410 Jump jmp) noexcept {
1411 switch (jmp) {
1412 case Jump::ja: return Jump::jb;
1413 case Jump::jae: return Jump::jbe;
1414 case Jump::jb: return Jump::ja;
1415 case Jump::jbe: return Jump::jae;
1416 case Jump::je: return Jump::je;
1417 case Jump::jne: return Jump::jne;
1418 case Jump::jg: return Jump::jl;
1419 case Jump::jge: return Jump::jle;
1420 case Jump::jl: return Jump::jg;
1421 case Jump::jle: return Jump::jge;
1422 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1423 }
1424}
1425
1426template <IRAdaptor Adaptor,
1427 typename Derived,
1428 template <typename, typename, typename> class BaseTy,
1429 typename Config>
1430FeCond CompilerX64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
1431 Jump jmp) noexcept {
1432 // LLVM won't transform the switch into a shift.
1433 FeCond res = FeCond(u32(jmp) << 16);
1434 switch (jmp) {
1435 case Jump::ja: assert(res == FE_CC_A && "FeCond value mismatch?"); break;
1436 case Jump::jae: assert(res == FE_CC_AE && "FeCond value mismatch?"); break;
1437 case Jump::jb: assert(res == FE_CC_B && "FeCond value mismatch?"); break;
1438 case Jump::jbe: assert(res == FE_CC_BE && "FeCond value mismatch?"); break;
1439 case Jump::je: assert(res == FE_CC_E && "FeCond value mismatch?"); break;
1440 case Jump::jg: assert(res == FE_CC_G && "FeCond value mismatch?"); break;
1441 case Jump::jge: assert(res == FE_CC_GE && "FeCond value mismatch?"); break;
1442 case Jump::jl: assert(res == FE_CC_L && "FeCond value mismatch?"); break;
1443 case Jump::jle: assert(res == FE_CC_LE && "FeCond value mismatch?"); break;
1444 case Jump::jne: assert(res == FE_CC_NE && "FeCond value mismatch?"); break;
1445 case Jump::jno: assert(res == FE_CC_NO && "FeCond value mismatch?"); break;
1446 case Jump::jo: assert(res == FE_CC_O && "FeCond value mismatch?"); break;
1447 case Jump::js: assert(res == FE_CC_S && "FeCond value mismatch?"); break;
1448 case Jump::jns: assert(res == FE_CC_NS && "FeCond value mismatch?"); break;
1449 case Jump::jp: assert(res == FE_CC_P && "FeCond value mismatch?"); break;
1450 case Jump::jnp: assert(res == FE_CC_NP && "FeCond value mismatch?"); break;
1451 default: TPDE_UNREACHABLE("invalid conditional jump");
1452 }
1453 return res;
1454}
1455
1456template <IRAdaptor Adaptor,
1457 typename Derived,
1458 template <typename, typename, typename> typename BaseTy,
1459 typename Config>
1460void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
1461 const Jump jmp,
1462 IRBlockRef target,
1463 const bool needs_split,
1464 const bool last_inst) noexcept {
1465 const auto target_idx = this->analyzer.block_idx(target);
1466 if (!needs_split || jmp == Jump::jmp) {
1467 this->derived()->move_to_phi_nodes(target_idx);
1468
1469 if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
1470 generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
1471 }
1472 } else {
1473 auto tmp_label = this->text_writer.label_create();
1474 generate_raw_jump(invert_jump(jmp), tmp_label);
1475
1476 this->derived()->move_to_phi_nodes(target_idx);
1477
1478 generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);
1479
1480 this->label_place(tmp_label);
1481 }
1482}
1483
1484template <IRAdaptor Adaptor,
1485 typename Derived,
1486 template <typename, typename, typename> typename BaseTy,
1487 typename Config>
1488void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1489 Jump jmp, Label target_label) noexcept {
1490 this->text_writer.ensure_space(6); // For safe ptr arithmetic on code buffer.
1491 bool pending = this->text_writer.label_is_pending(target_label);
1492 void *target = this->text_writer.cur_ptr();
1493 if (!pending) {
1494 target = this->text_writer.begin_ptr() +
1495 this->text_writer.label_offset(target_label);
1496 }
1497
1498 if (jmp == Jump::jmp) {
1499 ASMNCF(JMP, pending ? FE_JMPL : 0, target);
1500 } else {
1501 ASMNCF(Jcc, (pending ? FE_JMPL : 0) | jump_to_cond(jmp), target);
1502 }
1503
1504 if (pending) {
1505 this->text_writer.label_ref(target_label,
1506 this->text_writer.offset() - 4,
1507 LabelFixupKind::X64_JMP_OR_MEM_DISP);
1508 }
1509}
1510
1511template <IRAdaptor Adaptor,
1512 typename Derived,
1513 template <typename, typename, typename> class BaseTy,
1514 typename Config>
1515void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1516 Jump cc, AsmReg dst, bool zext) noexcept {
1517 if (zext) {
1518 ASM(MOV32ri, dst, 0);
1519 }
1520 ASMF(SETcc8r, jump_to_cond(cc), dst);
1521}
1522
1523template <IRAdaptor Adaptor,
1524 typename Derived,
1525 template <typename, typename, typename> class BaseTy,
1526 typename Config>
1527void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1528 Jump cc, AsmReg dst) noexcept {
1529 // TODO: use sbb dst,dst / adc dst,-1 for the carry flag
1530 generate_raw_set(cc, dst);
1531 ASM(NEG64r, dst);
1532}
1533template <IRAdaptor Adaptor,
1534 typename Derived,
1535 template <typename, typename, typename> class BaseTy,
1536 typename Config>
1537void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
1538 Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
1539 if (is_64) {
1540 ASMF(CMOVcc64rr, jump_to_cond(cc), dst, src);
1541 } else {
1542 ASMF(CMOVcc32rr, jump_to_cond(cc), dst, src);
1543 }
1544}
1545
1546template <IRAdaptor Adaptor,
1547 typename Derived,
1548 template <typename, typename, typename> class BaseTy,
1549 typename Config>
1550void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1551 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1552 assert(from < to && to <= 64);
1553 if (!sign) {
1554 switch (from) {
1555 case 8: ASM(MOVZXr32r8, dst, src); break;
1556 case 16: ASM(MOVZXr32r16, dst, src); break;
1557 case 32: ASM(MOV32rr, dst, src); break;
1558 default:
1559 if (from < 32) {
1560 if (dst != src) {
1561 ASM(MOV32rr, dst, src);
1562 }
1563 ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
1564 } else if (dst != src) {
1565 ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
1566 ASM(AND64rr, dst, src);
1567 } else {
1568 ScratchReg tmp{this};
1569 AsmReg tmp_reg = tmp.alloc_gp();
1570 ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
1571 ASM(AND64rr, dst, tmp_reg);
1572 }
1573 }
1574 } else if (to <= 32) {
1575 switch (from) {
1576 case 8: ASM(MOVSXr32r8, dst, src); break;
1577 case 16: ASM(MOVSXr32r16, dst, src); break;
1578 default:
1579 if (dst != src) {
1580 ASM(MOV32rr, dst, src);
1581 }
1582 ASM(SHL32ri, dst, 32 - from);
1583 ASM(SAR32ri, dst, 32 - from);
1584 }
1585 } else {
1586 switch (from) {
1587 case 8: ASM(MOVSXr64r8, dst, src); break;
1588 case 16: ASM(MOVSXr64r16, dst, src); break;
1589 case 32: ASM(MOVSXr64r32, dst, src); break;
1590 default:
1591 if (dst != src) {
1592 ASM(MOV64rr, dst, src);
1593 }
1594 ASM(SHL64ri, dst, 64 - from);
1595 ASM(SAR64ri, dst, 64 - from);
1596 }
1597 }
1598}
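// Illustrative example: generate_raw_intext(dst, src, /*sign=*/true,
// /*from=*/13, /*to=*/64) hits the default case above and emits a move (if
// dst != src) followed by `shl dst, 51` and `sar dst, 51`.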
1599
1600template <IRAdaptor Adaptor,
1601 typename Derived,
1602 template <typename, typename, typename> class BaseTy,
1603 typename Config>
1604void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
1605 set_stack_used() noexcept {
1606 if (stack_adjust_off == 0) {
1607 stack_adjust_off = this->compiler.text_writer.offset();
1608 // Always use 32-bit immediate
1609 ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
1610 assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
1611 }
1612}
1613
1614template <IRAdaptor Adaptor,
1615 typename Derived,
1616 template <typename, typename, typename> class BaseTy,
1617 typename Config>
1618void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
1619 ValuePart &vp, CCAssignment &cca) noexcept {
1620 AsmReg ptr = vp.load_to_reg(&this->compiler);
1621 ScratchReg scratch{&this->compiler};
1622 AsmReg tmp = scratch.alloc_gp();
1623
1624 auto size = cca.size;
1625 set_stack_used();
1626 i32 off = 0;
1627 while (size >= 8) {
1628 ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1629 ASMC(&this->compiler,
1630 MOV64mr,
1631 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1632 tmp);
1633 off += 8;
1634 size -= 8;
1635 }
1636 if (size >= 4) {
1637 ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1638 ASMC(&this->compiler,
1639 MOV32mr,
1640 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1641 tmp);
1642 off += 4;
1643 size -= 4;
1644 }
1645 if (size >= 2) {
1646 ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1647 ASMC(&this->compiler,
1648 MOV16mr,
1649 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1650 tmp);
1651 off += 2;
1652 size -= 2;
1653 }
1654 if (size >= 1) {
1655 ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1656 ASMC(&this->compiler,
1657 MOV8mr,
1658 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1659 tmp);
1660 }
1661}
1662
1663template <IRAdaptor Adaptor,
1664 typename Derived,
1665 template <typename, typename, typename> class BaseTy,
1666 typename Config>
1667void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
1668 ValuePart &vp, CCAssignment &cca) noexcept {
1669 set_stack_used();
1670
1671 auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
1672 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
1673 switch (cca.size) {
1674 case 1:
1675 ASMC(&this->compiler,
1676 MOV8mr,
1677 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1678 reg);
1679 break;
1680 case 2:
1681 ASMC(&this->compiler,
1682 MOV16mr,
1683 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1684 reg);
1685 break;
1686 case 4:
1687 ASMC(&this->compiler,
1688 MOV32mr,
1689 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1690 reg);
1691 break;
1692 case 8:
1693 ASMC(&this->compiler,
1694 MOV64mr,
1695 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1696 reg);
1697 break;
1698 default: TPDE_UNREACHABLE("invalid GP reg size");
1699 }
1700 } else {
1701 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
1702 switch (cca.size) {
1703 case 4:
1704 ASMC(&this->compiler,
1705 SSE_MOVSSmr,
1706 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1707 reg);
1708 break;
1709 case 8:
1710 ASMC(&this->compiler,
1711 SSE_MOVSDmr,
1712 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1713 reg);
1714 break;
1715 case 16:
1716 ASMC(&this->compiler,
1717 SSE_MOVDQAmr,
1718 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1719 reg);
1720 break;
1721 default: TPDE_UNREACHABLE("invalid FP reg size");
1722 }
1723 }
1724}
1725
1726template <IRAdaptor Adaptor,
1727 typename Derived,
1728 template <typename, typename, typename> class BaseTy,
1729 typename Config>
1730void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
1731 std::variant<SymRef, ValuePart> &&target) noexcept {
1732 if (this->assigner.is_vararg()) {
1733 if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
1734 this->compiler.evict_reg(Reg{AsmReg::AX});
1735 }
1736 Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
1737 Config::FP_BANK, 0);
1738 unsigned xmm_cnt = 8;
1739 if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
1740 xmm_cnt = next_xmm.id() - AsmReg::XMM0;
1741 }
1742 ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
1743 }
1744
1745 u32 sub = 0;
1746 if (stack_adjust_off != 0) {
1747 auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
1748 sub = util::align_up(this->assigner.get_stack_size(), 0x10);
1749 memcpy(inst_ptr + 3, &sub, sizeof(u32));
1750 } else {
1751 assert(this->assigner.get_stack_size() == 0);
1752 }
1753
1754 if (auto *sym = std::get_if<SymRef>(&target)) {
1755 this->compiler.text_writer.ensure_space(16);
1756 ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
1757 this->compiler.reloc_text(
1758 *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
1759 } else {
1760 ValuePart &tvp = std::get<ValuePart>(target);
1761 if (tvp.has_assignment() && !tvp.assignment().register_valid()) {
1762 assert(tvp.assignment().stack_valid());
1763 auto off = tvp.assignment().frame_off();
1764 ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
1765 } else if (tvp.can_salvage()) {
1766 ASMC(&this->compiler, CALLr, tvp.salvage(&this->compiler));
1767 } else {
1768 assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
1769 AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
1770 ASMC(&this->compiler, CALLr, reg);
1771 }
1772 tvp.reset(&this->compiler);
1773 }
1774
1775 if (stack_adjust_off != 0) {
1776 ASMC(&this->compiler, ADD64ri, FE_SP, sub);
1777 }
1778}
1779
1780template <IRAdaptor Adaptor,
1781 typename Derived,
1782 template <typename, typename, typename> typename BaseTy,
1783 typename Config>
1784void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
1785 std::variant<SymRef, ValuePart> &&target,
1786 std::span<CallArg> arguments,
1787 typename Base::ValueRef *result,
1788 const bool variable_args) {
1789 CCAssignerSysV assigner{variable_args};
1790 CallBuilder cb{*derived(), assigner};
1791 for (auto &arg : arguments) {
1792 cb.add_arg(std::move(arg));
1793 }
1794 cb.call(std::move(target));
1795 if (result) {
1796 cb.add_ret(*result);
1797 }
1798}
1799
1800template <IRAdaptor Adaptor,
1801 typename Derived,
1802 template <typename, typename, typename> typename BaseTy,
1803 typename Config>
1804CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
1805 CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
1806 SymRef sym, TLSModel model) noexcept {
1807 switch (model) {
1808 default: // TODO: implement optimized access for non-gd-model
1809 case TLSModel::GlobalDynamic: {
1810 // Generate function call to __tls_get_addr; on x86-64, this takes a single
1811 // parameter in rdi.
1812 auto csr = CCAssignerSysV::Info.callee_saved_regs;
1813 for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
1814 this->evict_reg(Reg{reg});
1815 }
1816 ScratchReg arg{this};
1817 AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);
1818
1819 // Call sequence with extra prefixes for linker relaxation. Code sequence
1820 // taken from "ELF Handling For Thread-Local Storage".
1821 this->text_writer.ensure_space(0x10);
1822 *this->text_writer.cur_ptr()++ = 0x66;
1823 ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
1824 this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
1825 *this->text_writer.cur_ptr()++ = 0x66;
1826 *this->text_writer.cur_ptr()++ = 0x66;
1827 *this->text_writer.cur_ptr()++ = 0x48;
1828 ASMNC(CALL, this->text_writer.cur_ptr());
1829 if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
1830 this->sym_tls_get_addr = this->assembler.sym_add_undef(
1831 "__tls_get_addr", Assembler::SymBinding::GLOBAL);
1832 }
1833 this->reloc_text(this->sym_tls_get_addr,
1834 R_X86_64_PLT32,
1835 this->text_writer.offset() - 4,
1836 -4);
1837 arg.reset();
1838
1839 ScratchReg res{this};
1840 res.alloc_specific(AsmReg::AX);
1841 return res;
1842 }
1843 }
1844}
1845
1846} // namespace tpde::x64